import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import math
from google.colab import drive
drive.mount("/content/drive")
# Location of the cleaned, merged dataset on Google Drive.
file_path = "/content/drive/MyDrive/Colab Notebooks/Sample/merged_data.csv"

# Load the cleaned CSV into a DataFrame.
df = pd.read_csv(file_path)

# Synthesize the data: perturb every metric column with Gaussian noise
# (mean 0, std = 10% of the column's own std) so individual values change
# slightly while each column keeps roughly its original distribution.
np.random.seed(0)  # fixed seed for reproducibility
metric_columns = [
    'Units', '$', '$ YA', 'Units YA', '9L EQ', '9L EQ YA', '%ACV',
    'Any Promo Units YA', 'Any Promo $ YA', 'Any Promo $',
    'Any Promo Units', '%ACV YA',
]
for column in metric_columns:
    sigma = df[column].std() * 0.1  # noise scale: 10% of the column's std
    df[column] = df[column] + np.random.normal(loc=0, scale=sigma, size=df[column].shape)
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
<ipython-input-24-51c7ccb67fe9>:16: DtypeWarning: Columns (21,37,38,39,40,41) have mixed types. Specify dtype option on import or set low_memory=False. df = pd.read_csv(file_path)
# Quick sanity check: dataset dimensions as (rows, columns).
df.shape
(287494, 62)
# Build a month-level datetime column from the Year and Month fields.
df['Date_Month'] = pd.to_datetime(df['Year'].astype(str) + '-' + df['Month'].astype(str))

# Report the time span covered by the data.
earliest = df['Date_Month'].min()
latest = df['Date_Month'].max()
print(f"The dataset covers a period from {earliest} to {latest}")
The dataset covers a period from 2018-07-01 00:00:00 to 2023-03-01 00:00:00
Now that all the data has been imported and prepared, we will proceed with a comprehensive overview of the market trends.
# Total units sold per month across the whole market.
monthly_sales = df.groupby('Date_Month')['Units'].sum()

# Fit a first-degree polynomial (straight line) against the month index.
month_idx = range(len(monthly_sales))
trend = np.polyfit(month_idx, monthly_sales.values, deg=1)
trend_line = np.polyval(trend, month_idx)

# Plot the raw series together with its linear trend.
plt.figure(figsize=(12, 6))
plt.plot(monthly_sales.index, monthly_sales.values, label='Monthly Sales')
plt.plot(monthly_sales.index, trend_line, 'r--', label='Trend Line')
plt.xlabel('Date_Month')
plt.ylabel('Monthly Sales')
plt.title('Monthly Sales Trends with Trend Line')
plt.legend()
plt.show()
It appears that the overall market is performing satisfactorily, with a gradual upward trend observed.
# Filter for BROWN-FORMAN products.
# .copy() makes bf_df an independent DataFrame, so later column assignments
# (e.g. bf_df['Date'] = ...) do not raise SettingWithCopyWarning or silently
# write to a view of df.
bf_df = df[df['BA MANUFACTURER'] == 'BROWN-FORMAN'].copy()

# Total units sold per brand family across the whole period.
bf_brand_units = bf_df.groupby('BA BRAND FAMILY')['Units'].sum()

# Each brand's share of total B-F units (fraction; converted to % below).
bf_portfolio_share = bf_brand_units / bf_brand_units.sum()

bf_portfolio_df = pd.DataFrame({
    'BA BRAND FAMILY': bf_portfolio_share.index,
    'Portfolio Share (%)': bf_portfolio_share.values
})

# Attach each brand's category. NOTE(review): a brand family that appears in
# more than one category (e.g. OLD FORESTER) gets one row per category here,
# so its portfolio share is repeated — confirm this duplication is intended.
bf_portfolio_df = bf_portfolio_df.merge(
    bf_df[['BA BRAND FAMILY', 'BA CATEGORY']].drop_duplicates(),
    on='BA BRAND FAMILY'
)

# Category share = sum of the portfolio shares of all brands in the category.
bf_portfolio_df['Category Share (%)'] = (
    bf_portfolio_df.groupby('BA CATEGORY')['Portfolio Share (%)'].transform('sum')
)

# Convert both share columns from fractions to rounded percentages
# (vectorized; equivalent to the old elementwise round(x*100, 2)).
bf_portfolio_df['Portfolio Share (%)'] = (bf_portfolio_df['Portfolio Share (%)'] * 100).round(2)
bf_portfolio_df['Category Share (%)'] = (bf_portfolio_df['Category Share (%)'] * 100).round(2)

# Sort brands by portfolio share, largest first.
bf_portfolio_df = bf_portfolio_df.sort_values(by='Portfolio Share (%)', ascending=False)

# Volume-weighted average unit price per brand = total $ / total units.
bf_brand_total_values = bf_df.groupby('BA BRAND FAMILY')[['$', 'Units']].sum()
bf_brand_avg_unit_price = bf_brand_total_values['$'] / bf_brand_total_values['Units']

bf_avg_price_df = pd.DataFrame({
    'BA BRAND FAMILY': bf_brand_avg_unit_price.index,
    'Avg Unit Price': bf_brand_avg_unit_price.values
})

# Merge the average price into the portfolio table.
bf_portfolio_df = bf_portfolio_df.merge(bf_avg_price_df, on='BA BRAND FAMILY')

# Round the average unit price to two decimal places for display.
bf_portfolio_df['Avg Unit Price'] = bf_portfolio_df['Avg Unit Price'].round(2)
bf_portfolio_df
| BA BRAND FAMILY | Portfolio Share (%) | BA CATEGORY | Category Share (%) | Avg Unit Price | |
|---|---|---|---|---|---|
| 0 | JACK DANIEL'S | 74.71 | WHISKEY | 81.32 | 21.28 |
| 1 | EL JIMADOR | 8.77 | TEQUILA | 12.32 | 18.68 |
| 2 | WOODFORD RESERVE | 5.65 | WHISKEY | 81.32 | 33.95 |
| 3 | HERRADURA | 3.54 | TEQUILA | 12.32 | 38.05 |
| 4 | KORBEL BRANDY | 2.25 | BRANDY | 2.25 | 16.45 |
| 5 | JACK DANIEL'S RTD | 1.96 | PREPARED COCKTAILS | 2.89 | 12.17 |
| 6 | CHAMBORD CORDIAL | 1.07 | CORDIALS | 1.07 | 20.39 |
| 7 | OLD FORESTER | 0.93 | PREPARED COCKTAILS | 2.89 | 25.93 |
| 8 | OLD FORESTER | 0.93 | WHISKEY | 81.32 | 25.93 |
| 9 | FINLANDIA | 0.58 | VODKA | 0.58 | 21.11 |
| 10 | DIPLOMATICO RUM | 0.35 | RUM | 0.35 | 28.48 |
| 11 | FORDS GIN | 0.09 | GIN | 0.14 | 13.93 |
| 12 | GIN MARE GIN | 0.04 | GIN | 0.14 | 14.40 |
| 13 | SLANE IRISH WSKY | 0.02 | WHISKEY | 81.32 | 96.66 |
| 14 | PEPE LOPEZ CORDIAL | 0.01 | CORDIALS | 1.07 | -4.05 |
| 15 | GLENGLASSAUGH WHISKEY | 0.01 | WHISKEY | 81.32 | 43.31 |
| 16 | PEPE LOPEZ TEQUILA | 0.01 | TEQUILA | 12.32 | -8.05 |
| 17 | COOPERS' CRAFT WSKY | 0.01 | WHISKEY | 81.32 | 32.72 |
| 18 | ANTIGUO DE HERRADURA | 0.00 | TEQUILA | 12.32 | -7.57 |
| 19 | DON EDUARDO | 0.00 | TEQUILA | 12.32 | -57.38 |
| 20 | CHAMBORD VODKA | -0.00 | VODKA | 0.58 | -61.25 |
| 21 | GLENDRONACH | -0.00 | WHISKEY | 81.32 | -1428.80 |
| 22 | BENRIACH | -0.01 | WHISKEY | 81.32 | 27.91 |
Jack Daniel's commands an impressive market share of 70%+ within the portfolio of Brown-Forman (B-F), highlighting its strong position within the company's product lineup. Additionally, whiskey as a category holds an even more substantial market share, accounting for over 80% of B-F's entire portfolio. This data underscores the significance of Jack Daniel's and whiskey as key drivers of B-F's business and emphasizes the importance of maintaining and furthering their success within the whiskey market segment.
These plots will enhance our understanding of Brown-Forman's brand distribution, market share, and overall portfolio dynamics, enabling us to make informed strategic decisions and optimize our business development efforts.
# Work on an independent copy so adding the Date column cannot trigger
# SettingWithCopyWarning (bf_df may still be a slice of df at this point,
# and under pandas copy-on-write the chained assignment would be lost).
bf_df = bf_df.copy()

# Month-start timestamp for each row, built from the Year/Month fields.
bf_df['Date'] = pd.to_datetime(bf_df[['Year', 'Month']].assign(DAY=1))

# Total Brown-Forman units sold per month.
bf_total_monthly_sales = bf_df.groupby('Date')['Units'].sum().reset_index()

# Linear trend fitted against the positional month index (0, 1, 2, ...).
z = np.polyfit(bf_total_monthly_sales.index, bf_total_monthly_sales['Units'], 1)
p = np.poly1d(z)

# Plot total monthly sales with the trend line overlaid.
plt.figure(figsize=(12, 6))
plt.plot(bf_total_monthly_sales['Date'], bf_total_monthly_sales['Units'])
plt.plot(bf_total_monthly_sales['Date'], p(bf_total_monthly_sales.index), "r--")  # trendline
plt.title('Total Monthly Sales for BROWN-FORMAN Over Time')
plt.xlabel('Date')
plt.ylabel('Total Units Sold')
plt.grid(True)
plt.show()
<ipython-input-29-6a9c4dd17fcf>:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy bf_df['Date'] = pd.to_datetime(bf_df[['Year', 'Month']].assign(DAY=1))
Based on visual observation, it appears that there has been a discernible downward trend in total sales since late 2018.
# Aggregate units sold per category, largest first, then draw a horizontal
# bar chart of the totals.
bf_total_category_sales = (
    bf_df.groupby('BA CATEGORY')['Units']
    .sum()
    .reset_index()
    .sort_values('Units', ascending=False)
)

plt.figure(figsize=(12, 6))
plt.barh(bf_total_category_sales['BA CATEGORY'], bf_total_category_sales['Units'])
plt.title('Total Sales by Category for BROWN-FORMAN')
plt.xlabel('Total Units Sold')
plt.ylabel('Category')
plt.grid(True)
plt.show()
Whiskey has secured the top position within the house, with tequila a close second. Furthermore, Ready-to-Drink (RTD) beverages have emerged as a rising star across the wider market, and Brown-Forman's presence in this category shows the company astutely identified and capitalized on the prevailing trend, positioning itself favorably to capture the growing demand for RTD beverages.
# Monthly units per category for Brown-Forman, as a Date x Category table.
bf_category_monthly_sales = (
    bf_df.groupby(['Date', 'BA CATEGORY'])['Units'].sum().unstack(fill_value=0)
)

# Each category's fraction of B-F's total units in that month.
bf_category_sales_percent = bf_category_monthly_sales.div(
    bf_category_monthly_sales.sum(axis=1), axis=0
)

import matplotlib.ticker as mticker

# DataFrame.plot creates its own figure, so pass figsize here instead of
# calling plt.figure() first — the old code left behind a stray empty
# "<Figure size 3000x600 with 0 Axes>".
ax = bf_category_sales_percent.plot(
    kind='line',
    figsize=(30, 6),
    title='Sales% by Category Over Time for BROWN-FORMAN',
)
ax.yaxis.set_major_formatter(mticker.PercentFormatter(1))
ax.set_ylabel('Sales% by Category')
plt.show()
<Figure size 3000x600 with 0 Axes>
Since late 2018, all categories have maintained consistent market shares without experiencing significant changes.
# Monthly unit sales per brand family for Brown-Forman.
bf_monthly_brand_sales = (
    bf_df.groupby(['Date', 'BA BRAND FAMILY'])['Units'].sum().reset_index()
)

# The five best-selling brand families over the whole period.
top_brands = (
    bf_monthly_brand_sales.groupby('BA BRAND FAMILY')['Units'].sum().nlargest(5).index
)

# Keep only rows belonging to those five brands.
bf_monthly_top_brand_sales = bf_monthly_brand_sales[
    bf_monthly_brand_sales['BA BRAND FAMILY'].isin(top_brands)
]

# One sales line (plus a red dashed linear trend) per top brand.
plt.figure(figsize=(15, 6))
for brand in top_brands:
    brand_data = bf_monthly_top_brand_sales[
        bf_monthly_top_brand_sales['BA BRAND FAMILY'] == brand
    ]
    month_idx = range(len(brand_data['Date']))
    trend_fn = np.poly1d(np.polyfit(month_idx, brand_data['Units'], 1))
    plt.plot(brand_data['Date'], trend_fn(month_idx), "r--")
    plt.plot(brand_data['Date'], brand_data['Units'], label=brand)
plt.title('Monthly Sales for Top 5 Brands Over Time')
plt.xlabel('Date')
plt.ylabel('Units Sold')
plt.legend()
plt.grid(True)
plt.show()
After analyzing the monthly sales chart, it is evident that Jack Daniel's holds the leading position within the market. However, a visual observation reveals a noticeable downward trend since late 2018.
Given the declining market share of Brown-Forman (B-F) in terms of volume, it would be prudent to investigate whether this decrease is a deliberate strategy aimed at reducing sales.
# Monthly promotional spend for Brown-Forman.
bf_total_monthly_promo = bf_df.groupby('Date')['Any Promo $'].sum().reset_index()

# Linear trends (fitted against the positional month index) for promo spend
# and for unit sales.
p_promo = np.poly1d(
    np.polyfit(bf_total_monthly_promo.index, bf_total_monthly_promo['Any Promo $'], 1)
)
p_sales = np.poly1d(
    np.polyfit(bf_total_monthly_sales.index, bf_total_monthly_sales['Units'], 1)
)

# Units sold (left axis, blue) and promo $ (right axis, green) on one chart.
fig, ax1 = plt.subplots(figsize=(12, 6))

sales_color = 'tab:blue'
ax1.set_xlabel('Date')
ax1.set_ylabel('Total Units Sold', color=sales_color)
ax1.plot(bf_total_monthly_sales['Date'], bf_total_monthly_sales['Units'], color=sales_color)
ax1.plot(bf_total_monthly_sales['Date'], p_sales(bf_total_monthly_sales.index), "b--")  # sales trend
ax1.tick_params(axis='y', labelcolor=sales_color)

ax2 = ax1.twinx()  # second y-axis sharing the same x-axis
promo_color = 'tab:green'
ax2.set_ylabel('Total Promo $ Spent', color=promo_color)
ax2.plot(bf_total_monthly_promo['Date'], bf_total_monthly_promo['Any Promo $'], color=promo_color)
ax2.plot(bf_total_monthly_promo['Date'], p_promo(bf_total_monthly_promo.index), "g--")  # promo trend
ax2.tick_params(axis='y', labelcolor=promo_color)

plt.title('Total Monthly Sales and Total Promo $ for BROWN-FORMAN Over Time')
plt.grid(True)
fig.tight_layout()
plt.show()
A clear observation from the chart: units sold (blue line, with blue trend line) decline more slowly than promo $ (green). It appears that reduced promotional spending may be the primary factor contributing to Brown-Forman's loss of market share. If this reduction was an intentional decision aimed at increasing profitability, it could potentially be viewed as a favorable approach aligned with their long-term strategic goals.
Having focused on Brown-Forman's portfolio, let's now broaden our scope to encapsulate the entire market. This broader perspective will provide context to our previous findings and allow us to identify market-wide trends and opportunities.
# Market-share trends for the top manufacturers in three key categories.
categories = ['TEQUILA', 'WHISKEY', 'VODKA']

for category in categories:
    # .copy() so the Date assignment below does not hit a view of df
    # (this previously raised SettingWithCopyWarning on every iteration).
    category_df = df[df['BA CATEGORY'] == category].copy()

    # Month-start timestamp from the Year/Month fields.
    category_df['Date'] = pd.to_datetime(category_df[['Year', 'Month']].assign(Day=1))

    # Units sold per manufacturer per month.
    category_monthly_sales = (
        category_df.groupby(['Date', 'BA MANUFACTURER'])['Units'].sum().reset_index()
    )

    # Market share = a manufacturer's units / all units sold that month.
    # transform('sum') keeps the result aligned with the original rows;
    # the old groupby(...).apply(lambda x: x / x.sum()) returns a
    # re-indexed series and breaks on assignment in recent pandas versions.
    category_monthly_sales['Market Share'] = (
        category_monthly_sales['Units']
        / category_monthly_sales.groupby('Date')['Units'].transform('sum')
        * 100
    )

    # Top 7 manufacturers by total units over the whole period.
    top_manufacturers = (
        category_monthly_sales.groupby('BA MANUFACTURER')['Units'].sum().nlargest(7).index
    )
    category_monthly_top_manufacturer_sales = category_monthly_sales[
        category_monthly_sales['BA MANUFACTURER'].isin(top_manufacturers)
    ]

    # One market-share line (plus a red dashed linear trend) per manufacturer.
    plt.figure(figsize=(15, 10))
    for manufacturer in top_manufacturers:
        manufacturer_data = category_monthly_top_manufacturer_sales[
            category_monthly_top_manufacturer_sales['BA MANUFACTURER'] == manufacturer
        ]
        month_idx = range(len(manufacturer_data['Date']))
        trend_fn = np.poly1d(np.polyfit(month_idx, manufacturer_data['Market Share'], 1))
        plt.plot(manufacturer_data['Date'], trend_fn(month_idx), "r--")
        plt.plot(manufacturer_data['Date'], manufacturer_data['Market Share'], label=manufacturer)
    plt.title(f'Monthly Market Share for Top 7 Manufacturers in {category} Over Time')
    plt.xlabel('Date')
    plt.ylabel('Market Share (%)')
    plt.legend()
    plt.grid(True)
    plt.show()
<ipython-input-34-54c1edfb065a>:10: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy category_df['Date'] = pd.to_datetime(category_df[['Year', 'Month']].assign(Day=1))
<ipython-input-34-54c1edfb065a>:10: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy category_df['Date'] = pd.to_datetime(category_df[['Year', 'Month']].assign(Day=1))
<ipython-input-34-54c1edfb065a>:10: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy category_df['Date'] = pd.to_datetime(category_df[['Year', 'Month']].assign(Day=1))
Regrettably, Brown-Forman (B-F) has experienced a decline in market shares within the whiskey segment since 2020. Additionally, BF has yet to establish a significant presence in the vodka market, indicating a relatively smaller role in that particular category. To assess the overall market situation, let us investigate whether the entire market is contracting or experiencing a decline.
# Whiskey-only slice. .copy() so the Date_Month assignment below does not
# trigger SettingWithCopyWarning on a view of df; later cells reuse this
# whiskey_df, so it must be a real, writable DataFrame.
whiskey_df = df[df['BA CATEGORY'] == 'WHISKEY'].copy()

# Month-level datetime built from the Year and Month columns.
whiskey_df['Date_Month'] = pd.to_datetime(
    whiskey_df['Year'].astype(str) + '-' + whiskey_df['Month'].astype(str)
)

# Total whiskey units sold per month.
monthly_sales = whiskey_df.groupby('Date_Month')['Units'].sum()

# Linear trend over the positional month index.
trend = np.polyfit(range(len(monthly_sales)), monthly_sales.values, deg=1)
trend_line = np.polyval(trend, range(len(monthly_sales)))

# Plot the series together with its trend line.
plt.figure(figsize=(15, 6))
plt.plot(monthly_sales.index, monthly_sales.values, label='Whiskey Monthly Sales')
plt.plot(monthly_sales.index, trend_line, 'r--', label='Trend Line')
plt.xlabel('Date_Month')
plt.ylabel('Monthly Sales')
plt.title('Whiskey Monthly Sales Trends with Trend Line')
plt.legend()
plt.show()
<ipython-input-35-9e6f5c8414ec>:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy whiskey_df['Date_Month'] = pd.to_datetime(whiskey_df['Year'].astype(str) + '-' + whiskey_df['Month'].astype(str))
While the spirits market as a whole has experienced recent expansion, the whiskey segment has shown a stable performance.
To identify the competitors who have captured market share from Jack Daniel's, it is imperative to conduct a comprehensive analysis. By examining market dynamics, consumer preferences, and sales data, we can pinpoint the specific brands or products that have successfully eroded Jack Daniel's market share. This analysis will provide valuable insights to inform strategic decisions aimed at countering the competition and regaining lost market share in the whiskey category.
Given the diverse range of brand and sub-brand offerings within the whiskey segment, it is essential to identify the popular sizes in terms of market share and unit price. Let us now delve into the analysis to determine the prevalent sizes that resonate with consumers and contribute significantly to the whiskey market.
# Units sold per bottle size per month in the whiskey segment.
# NOTE(review): this relies on whiskey_df having a 'Date' column — it is not
# added in the whiskey cell above, so presumably it comes from the source
# CSV; confirm.
whiskey_monthly_size_sales = (
    whiskey_df.groupby(['Date', 'BA SIZE'])['Units'].sum().reset_index()
)

# Market share per size per month. transform('sum') keeps the denominator
# aligned with the original rows; the old groupby(...).apply(lambda x:
# x / x.sum()) returns a re-indexed series and breaks on assignment in
# recent pandas versions.
whiskey_monthly_size_sales['Market Share'] = (
    whiskey_monthly_size_sales['Units']
    / whiskey_monthly_size_sales.groupby('Date')['Units'].transform('sum')
    * 100
)

# One market-share line per distinct size.
sizes = whiskey_monthly_size_sales['BA SIZE'].unique()
plt.figure(figsize=(15, 6))
for size in sizes:
    size_data = whiskey_monthly_size_sales[whiskey_monthly_size_sales['BA SIZE'] == size]
    plt.plot(size_data['Date'], size_data['Market Share'], label=size)
plt.title('Monthly Market Share for Each Size of Whiskey Over Time')
plt.xlabel('Date')
plt.ylabel('Market Share (%)')
plt.legend()
plt.grid(True)
plt.show()
According to the chart analysis, the top two sizes dominating the whiskey market are 750ML and 1.75L. These sizes demonstrate the highest market demand and consumer preference within the whiskey segment.
# Benchmark Jack Daniel's 750ML against similarly priced competitors.
# Reference price: mean unit price of JD 750ML (the most popular size).
jd_price = whiskey_df[
    (whiskey_df['BA BRAND FAMILY'] == "JACK DANIEL'S") & (whiskey_df['BA SIZE'] == '750ML')
]['Unit Price'].mean()

# "Similar price" band: 90%-110% of JD's 750ML unit price.
price_range = (jd_price * 0.90, jd_price * 1.1)

# 750ML whiskeys inside the band, excluding JD itself.
competitors_df = whiskey_df[
    (whiskey_df['Unit Price'].between(*price_range))
    & (whiskey_df['BA BRAND FAMILY'] != "JACK DANIEL'S")
    & (whiskey_df['BA SIZE'] == '750ML')
]
competitors = competitors_df['BA BRAND FAMILY'].unique()

# Brands to profile: every competitor plus Jack Daniel's for reference.
competitors_list = list(competitors) + ["JACK DANIEL'S"]

# Whiskey units sold per year — computed ONCE, outside the brand loop
# (previously recomputed for every brand, a full scan of df each time).
yearly_whiskey_units = {
    year: df[(df['Year'] == year) & (df['BA CATEGORY'] == 'WHISKEY')]['Units'].sum()
    for year in range(2019, 2023)
}

competitors_info = {
    'Brand': [], 'Manufacturer': [], 'Unit Price': [],
    'Market Share 2019 (%)': [], 'Market Share 2020 (%)': [],
    'Market Share 2021 (%)': [], 'Market Share 2022 (%)': [],
}

for competitor in competitors_list:
    competitor_df = df[df['BA BRAND FAMILY'] == competitor]

    # A brand family normally maps to one manufacturer; take the first found.
    manufacturer = competitor_df['BA MANUFACTURER'].unique()

    # Volume-weighted average unit price = total $ / total units.
    total_dollars = competitor_df['$'].sum()
    total_units = competitor_df['Units'].sum()
    avg_unit_price = round(total_dollars / total_units, 2)

    # Share of the whiskey market (by units) for each year 2019-2022.
    market_shares = []
    for year in range(2019, 2023):
        competitor_units = competitor_df[competitor_df['Year'] == year]['Units'].sum()
        market_shares.append(round((competitor_units / yearly_whiskey_units[year]) * 100, 2))

    competitors_info['Brand'].append(competitor)
    competitors_info['Manufacturer'].append(manufacturer[0] if manufacturer.size > 0 else None)
    competitors_info['Unit Price'].append(avg_unit_price)
    competitors_info['Market Share 2019 (%)'].append(market_shares[0])
    competitors_info['Market Share 2020 (%)'].append(market_shares[1])
    competitors_info['Market Share 2021 (%)'].append(market_shares[2])
    competitors_info['Market Share 2022 (%)'].append(market_shares[3])

# Materialize the collected rows as a DataFrame.
competitors_info_df = pd.DataFrame(competitors_info)
competitors_info_df
| Brand | Manufacturer | Unit Price | Market Share 2019 (%) | Market Share 2020 (%) | Market Share 2021 (%) | Market Share 2022 (%) | |
|---|---|---|---|---|---|---|---|
| 0 | BUCHANAN'S | DIAGEO | 30.34 | 2.69 | 2.25 | 2.58 | 2.22 |
| 1 | WOODFORD RESERVE | BROWN-FORMAN | 33.95 | 1.02 | 1.25 | 1.24 | 1.35 |
| 2 | GLENLIVET | PERNOD RICARD | 36.62 | 0.92 | 1.06 | 0.99 | 0.99 |
| 3 | JOHNNIE WALKER | DIAGEO | 31.44 | 2.63 | 2.31 | 2.35 | 2.30 |
| 4 | JAMESON | PERNOD RICARD | 24.17 | 11.03 | 10.76 | 10.19 | 9.62 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 278 | PINHOOK WHISKEY | ALL OTHER COMPANIES | 13.03 | 0.02 | 0.02 | 0.02 | 0.03 |
| 279 | KAVALAN | ALL OTHER COMPANIES | 11.19 | 0.00 | -0.02 | 0.00 | -0.01 |
| 280 | GLENFARCLAS | SAZERAC | 34.88 | 0.00 | 0.00 | 0.01 | 0.01 |
| 281 | PB & W WHISKEY | ALL OTHER COMPANIES | 22.22 | 0.00 | 0.02 | 0.03 | 0.00 |
| 282 | JACK DANIEL'S | BROWN-FORMAN | 21.28 | 17.54 | 16.23 | 14.75 | 13.96 |
283 rows × 7 columns
Given the presence of over 300 competitors and brands within the whiskey market (750ml avg unit price is 90% - 110% of Jack Daniel's), it is crucial to conduct a more comprehensive analysis of the average unit prices specifically for the 750ML and 1.75L sizes. This deeper examination will enable us to gain insights into the pricing dynamics and identify the top brands that exert significant influence within the whiskey market. By narrowing our focus to these key brands, we can better understand their pricing strategies and market positioning.
# Richer competitor table: overall market share, blended unit price, average
# 750ML / 1.75L shelf prices, plus yearly shares 2019-2022.
competitors_info = {
    'Brand': [], 'Manufacturer': [], 'Total Market Share (%)': [], 'Unit Price': [],
    '750ML Unit Price': [], '1.75L Unit Price': [],
    'Market Share 2019 (%)': [], 'Market Share 2020 (%)': [],
    'Market Share 2021 (%)': [], 'Market Share 2022 (%)': [],
}

# Loop invariants hoisted out of the brand loop: total whiskey units overall
# and per year (previously recomputed for every one of ~280 brands).
all_whiskey_units = df[df['BA CATEGORY'] == 'WHISKEY']['Units'].sum()
yearly_whiskey_units = {
    year: df[(df['Year'] == year) & (df['BA CATEGORY'] == 'WHISKEY')]['Units'].sum()
    for year in range(2019, 2023)
}

for competitor in competitors_list:
    competitor_df = df[df['BA BRAND FAMILY'] == competitor]
    manufacturer = competitor_df['BA MANUFACTURER'].unique()

    # Share of all whiskey units across the whole period.
    total_units = competitor_df['Units'].sum()
    total_market_share = round((total_units / all_whiskey_units) * 100, 2)

    # Blended (volume-weighted) unit price = total $ / total units.
    avg_unit_price = round(competitor_df['$'].sum() / total_units, 2)

    # Simple mean shelf price for the two dominant sizes (NaN when the brand
    # does not sell that size).
    avg_unit_price_750ml = round(
        competitor_df[competitor_df['BA SIZE'] == '750ML']['Unit Price'].mean(), 2
    )
    avg_unit_price_175L = round(
        competitor_df[competitor_df['BA SIZE'] == '1.75L']['Unit Price'].mean(), 2
    )

    # Yearly whiskey market share, 2019-2022.
    market_shares = []
    for year in range(2019, 2023):
        competitor_units = competitor_df[competitor_df['Year'] == year]['Units'].sum()
        market_shares.append(round((competitor_units / yearly_whiskey_units[year]) * 100, 2))

    competitors_info['Brand'].append(competitor)
    competitors_info['Manufacturer'].append(manufacturer[0] if manufacturer.size > 0 else None)
    competitors_info['Total Market Share (%)'].append(total_market_share)
    competitors_info['Unit Price'].append(avg_unit_price)
    competitors_info['750ML Unit Price'].append(avg_unit_price_750ml)
    competitors_info['1.75L Unit Price'].append(avg_unit_price_175L)
    competitors_info['Market Share 2019 (%)'].append(market_shares[0])
    competitors_info['Market Share 2020 (%)'].append(market_shares[1])
    competitors_info['Market Share 2021 (%)'].append(market_shares[2])
    competitors_info['Market Share 2022 (%)'].append(market_shares[3])

competitors_info_df = pd.DataFrame(competitors_info)

# Keep only the 20 brands with the largest overall share. nlargest sorts
# descending, so Jack Daniel's (the share leader) naturally lands in the
# first row — the old `jd_index = competitors` line was dead code that did
# not implement its comment and has been removed.
competitors_info_df = competitors_info_df.nlargest(20, 'Total Market Share (%)')
competitors_info_df
| Brand | Manufacturer | Total Market Share (%) | Unit Price | 750ML Unit Price | 1.75L Unit Price | Market Share 2019 (%) | Market Share 2020 (%) | Market Share 2021 (%) | Market Share 2022 (%) | |
|---|---|---|---|---|---|---|---|---|---|---|
| 282 | JACK DANIEL'S | BROWN-FORMAN | 15.70 | 21.28 | 29.84 | 34.69 | 17.54 | 16.23 | 14.75 | 13.96 |
| 4 | JAMESON | PERNOD RICARD | 10.42 | 24.17 | 44.64 | 46.44 | 11.03 | 10.76 | 10.19 | 9.62 |
| 80 | JIM BEAM BRBN | BEAM SUNTORY | 8.92 | 14.80 | 17.19 | 24.75 | 9.69 | 9.18 | 8.54 | 7.89 |
| 129 | CROWN ROYAL | DIAGEO | 6.20 | 24.52 | 35.44 | 39.69 | 6.40 | 6.43 | 6.17 | 5.71 |
| 14 | BULLEIT WHISKEY | DIAGEO | 3.68 | 27.59 | 35.85 | 42.44 | 3.54 | 3.86 | 3.64 | 3.54 |
| 13 | MAKER'S MARK | BEAM SUNTORY | 3.37 | 26.58 | 41.20 | 42.71 | 3.17 | 3.39 | 3.50 | 3.49 |
| 38 | EVAN WILLIAMS | HEAVEN HILL | 3.03 | 14.73 | 14.56 | 22.68 | 3.17 | 3.09 | 2.83 | 2.95 |
| 0 | BUCHANAN'S | DIAGEO | 2.56 | 30.34 | 56.42 | 51.52 | 2.69 | 2.25 | 2.58 | 2.22 |
| 3 | JOHNNIE WALKER | DIAGEO | 2.40 | 31.44 | 92.33 | 85.66 | 2.63 | 2.31 | 2.35 | 2.30 |
| 1 | WOODFORD RESERVE | BROWN-FORMAN | 1.19 | 33.95 | 61.92 | 58.57 | 1.02 | 1.25 | 1.24 | 1.35 |
| 142 | WILD TURKEY | CAMPARI AMERICA | 1.11 | 20.46 | 44.96 | 33.89 | 1.13 | 1.15 | 1.10 | 1.09 |
| 2 | GLENLIVET | PERNOD RICARD | 0.99 | 36.62 | 93.30 | 82.17 | 0.92 | 1.06 | 0.99 | 0.99 |
| 19 | SKREWBALL WHISKEY | INFINIUM | 0.98 | 24.25 | 25.57 | NaN | 0.33 | 1.07 | 1.35 | 1.50 |
| 12 | FOUR ROSES | ALL OTHER COMPANIES | 0.93 | 27.68 | 49.04 | 39.32 | 0.86 | 1.01 | 0.94 | 1.01 |
| 103 | DEWAR'S | BACARDI | 0.87 | 23.75 | 55.22 | 27.01 | 0.87 | 0.90 | 0.87 | 0.79 |
| 16 | PENDLETON | PROXIMO | 0.85 | 23.13 | 31.31 | 41.71 | 0.79 | 0.83 | 0.83 | 0.97 |
| 265 | PROPER NO12 WHISKEY | PROXIMO | 0.83 | 22.10 | 21.53 | 39.78 | 0.39 | 0.91 | 1.07 | 1.14 |
| 26 | TULLAMORE DEW | WILLIAM GRANT | 0.73 | 20.75 | 28.13 | 36.33 | 0.71 | 0.81 | 0.74 | 0.72 |
| 30 | BUSHMILLS | PROXIMO | 0.66 | 19.32 | 38.99 | NaN | 0.78 | 0.69 | 0.61 | 0.57 |
| 5 | KNOB CREEK | BEAM SUNTORY | 0.60 | 32.29 | 50.09 | 59.21 | 0.65 | 0.67 | 0.60 | 0.58 |
Overall, JD is still the No. 1 whiskey; however, it has lost about 3.5 percentage points of market share since 2019 — roughly a 20% relative drop.
change_col = 'Market Share Change 2019-2022 (%)'

# Net change in whiskey market share between 2019 and 2022.
competitors_info_df[change_col] = (
    competitors_info_df['Market Share 2022 (%)']
    - competitors_info_df['Market Share 2019 (%)']
)

# Biggest gainers sit at the top of the descending sort; biggest losers at
# the top of the ascending sort.
top_5_gainers = competitors_info_df.sort_values(change_col, ascending=False).head(5)
top_5_losers = competitors_info_df.sort_values(change_col, ascending=True).head(5)
top_5_gainers
| Brand | Manufacturer | Total Market Share (%) | Unit Price | 750ML Unit Price | 1.75L Unit Price | Market Share 2019 (%) | Market Share 2020 (%) | Market Share 2021 (%) | Market Share 2022 (%) | Market Share Change 2019-2022 (%) | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 19 | SKREWBALL WHISKEY | INFINIUM | 0.98 | 24.25 | 25.57 | NaN | 0.33 | 1.07 | 1.35 | 1.50 | 1.17 |
| 265 | PROPER NO12 WHISKEY | PROXIMO | 0.83 | 22.10 | 21.53 | 39.78 | 0.39 | 0.91 | 1.07 | 1.14 | 0.75 |
| 1 | WOODFORD RESERVE | BROWN-FORMAN | 1.19 | 33.95 | 61.92 | 58.57 | 1.02 | 1.25 | 1.24 | 1.35 | 0.33 |
| 13 | MAKER'S MARK | BEAM SUNTORY | 3.37 | 26.58 | 41.20 | 42.71 | 3.17 | 3.39 | 3.50 | 3.49 | 0.32 |
| 16 | PENDLETON | PROXIMO | 0.85 | 23.13 | 31.31 | 41.71 | 0.79 | 0.83 | 0.83 | 0.97 | 0.18 |
# Display the five brands that lost the most market share (computed above).
top_5_losers
| Brand | Manufacturer | Total Market Share (%) | Unit Price | 750ML Unit Price | 1.75L Unit Price | Market Share 2019 (%) | Market Share 2020 (%) | Market Share 2021 (%) | Market Share 2022 (%) | Market Share Change 2019-2022 (%) | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 282 | JACK DANIEL'S | BROWN-FORMAN | 15.70 | 21.28 | 29.84 | 34.69 | 17.54 | 16.23 | 14.75 | 13.96 | -3.58 |
| 80 | JIM BEAM BRBN | BEAM SUNTORY | 8.92 | 14.80 | 17.19 | 24.75 | 9.69 | 9.18 | 8.54 | 7.89 | -1.80 |
| 4 | JAMESON | PERNOD RICARD | 10.42 | 24.17 | 44.64 | 46.44 | 11.03 | 10.76 | 10.19 | 9.62 | -1.41 |
| 129 | CROWN ROYAL | DIAGEO | 6.20 | 24.52 | 35.44 | 39.69 | 6.40 | 6.43 | 6.17 | 5.71 | -0.69 |
| 0 | BUCHANAN'S | DIAGEO | 2.56 | 30.34 | 56.42 | 51.52 | 2.69 | 2.25 | 2.58 | 2.22 | -0.47 |
Regrettably, based on the available data spanning from 2019 to 2022, Jack Daniel's (JD) emerges as the top brand that has experienced a decline in market performance. This observation highlights the need to assess and address the factors contributing to JD's loss of market share during this period. By conducting a detailed analysis of the underlying causes and exploring potential mitigation strategies, we can work towards reversing this trend and positioning JD for future growth and success.
As observed earlier, it is evident that Brown-Forman (B-F) heavily relies on Jack Daniel's (JD) within its portfolio. However, B-F has experienced a decline in its market share, particularly within the critical whiskey segment. This trend highlights the need for a closer examination of the factors contributing to the loss of market share. By conducting a comprehensive analysis of market dynamics, consumer preferences, and competitive landscape, we can gain insights into the reasons behind this decline. This analysis will provide a solid foundation for developing strategic initiatives aimed at revitalizing B-F's market position and reclaiming lost market share in the whiskey segment.
from sklearn.linear_model import LinearRegression
import matplotlib.dates as mdates

# Restrict to the whiskey category and aggregate unit sales per date,
# brand extension and manufacturer.
whiskey_df = (
    df[df['BA CATEGORY'] == 'WHISKEY']
    .groupby(['Date', 'BA BRAND EXTENSION', 'BA MANUFACTURER'])['Units']
    .sum()
    .reset_index()
)

# Total monthly whiskey sales — the denominator for every market share below.
total_whiskey_sales = whiskey_df.groupby('Date')['Units'].sum()

# Monthly sales for JD Black, for all Brown-Forman whiskey, and for the
# rest of the B-F whiskey portfolio (B-F minus JD Black).
jd_sales = (
    whiskey_df[whiskey_df['BA BRAND EXTENSION'] == "JACK DANIELS BLACK BRBN WSKY"]
    .groupby('Date')['Units']
    .sum()
)
bf_sales = (
    whiskey_df[whiskey_df['BA MANUFACTURER'] == "BROWN-FORMAN"]
    .groupby('Date')['Units']
    .sum()
)
bf_rest_sales = bf_sales - jd_sales

# Monthly market shares, in percent.
jd_market_share = jd_sales / total_whiskey_sales * 100
bf_market_share = bf_sales / total_whiskey_sales * 100
bf_rest_market_share = bf_rest_sales / total_whiskey_sales * 100

# Ensure each series is indexed by a DatetimeIndex so it can be converted
# to matplotlib date numbers for the regressions below.
for _share in (jd_market_share, bf_market_share, bf_rest_market_share):
    _share.index = pd.DatetimeIndex(_share.index)

# Convert dates to matplotlib float days — the x variable for the trends.
jd_dates, bf_dates, bf_rest_dates = (
    mdates.date2num(_share.index.to_pydatetime())
    for _share in (jd_market_share, bf_market_share, bf_rest_market_share)
)

# One simple linear trend per series.
jd_model, bf_model, bf_rest_model = (
    LinearRegression().fit(_x.reshape(-1, 1), _y)
    for _x, _y in (
        (jd_dates, jd_market_share),
        (bf_dates, bf_market_share),
        (bf_rest_dates, bf_rest_market_share),
    )
)

jd_trend = jd_model.predict(jd_dates.reshape(-1, 1))
bf_trend = bf_model.predict(bf_dates.reshape(-1, 1))
bf_rest_trend = bf_rest_model.predict(bf_rest_dates.reshape(-1, 1))

# Plot the monthly market shares, each with its dotted red trend line.
plt.figure(figsize=(15,6))
plt.plot(jd_market_share.index, jd_market_share, color='black', label='JD Blk')
plt.plot(jd_market_share.index, jd_trend, 'r:')
plt.plot(bf_market_share.index, bf_market_share, color='brown', label='Brown-Forman All Whiskey Products')
plt.plot(bf_market_share.index, bf_trend, 'r:')
plt.plot(bf_rest_market_share.index, bf_rest_market_share, color='blue', label='Brown-Forman Rest Whiskey Products_sub brands')
plt.plot(bf_rest_market_share.index, bf_rest_trend, 'r:')
plt.title("Monthly Market Share in the entire CAxAOC Whiskey Market")
plt.xlabel('Date')
plt.ylabel('Market Share (%)')
plt.legend()
plt.grid(True)
plt.show()
Considering the challenges that Jack Daniel's (JD Blk) has been encountering, it is essential to explore and analyze its pricing strategy. Assuming we have ample inventory available, conducting an in-depth analysis of JD's pricing strategy will provide valuable insights into its effectiveness and alignment with market dynamics. By evaluating factors such as price positioning, competitive landscape, consumer preferences, and market trends, we can assess whether JD's current pricing strategy is optimized to maximize profitability, enhance market competitiveness, and effectively address the challenges at hand. This analysis will enable us to make data-driven decisions and potentially refine JD Blk's pricing approach to achieve improved business outcomes.
Let's plot the price and sales changes over time to assess their correlation strength.
# Filter for JD Black only.
jd_df = df[df['BA BRAND EXTENSION'] == "JACK DANIELS BLACK BRBN WSKY"]
# Group by year and month and total the units sold and dollar sales.
jd_monthly = jd_df.groupby(['Year', 'Month']).agg({'Units': 'sum', '$': 'sum'})
# Average realised unit price = dollar sales / units.
jd_monthly['Unit Price'] = jd_monthly['$'] / jd_monthly['Units']
# Reset the index so Year/Month become columns again.
jd_monthly.reset_index(inplace=True)
# Build a proper datetime index from the Year/Month parts (day fixed to 1).
jd_monthly['Date'] = pd.to_datetime(jd_monthly[['Year', 'Month']].assign(day=1))
jd_monthly.set_index('Date', inplace=True)
# Month-over-month percentage change of volume and price.
jd_monthly['Units % Change'] = jd_monthly['Units'].pct_change() * 100
jd_monthly['Price % Change'] = jd_monthly['Unit Price'].pct_change() * 100
# Drop the first row since pct_change leaves it as NaN.
jd_monthly = jd_monthly.iloc[1:]
# BUG FIX: the original called plt.figure(figsize=(30, 10)) and THEN
# plt.subplots(), which opens a second, default-sized figure for the plot
# and leaves the 30x10 one empty (hence the stray
# "<Figure size 3000x1000 with 0 Axes>" output). Pass figsize directly
# to subplots instead.
fig, ax1 = plt.subplots(figsize=(30, 10))
# % change in units sold on the left axis.
ax1.plot(jd_monthly.index, jd_monthly['Units % Change'], color='blue', label='% Change in Units Sold')
# % change in price on a twinned right axis (different scale).
ax2 = ax1.twinx()
ax2.plot(jd_monthly.index, jd_monthly['Price % Change'], color='red', label='% Change in Price')
# Axis labels, colour-matched to their series.
ax1.set_xlabel('Date')
ax1.set_ylabel('% Change in Units Sold', color='blue')
ax2.set_ylabel('% Change in Price', color='red')
plt.title('JD Blk Monthly % Change in Units Sold vs. Price')
plt.show()
<Figure size 3000x1000 with 0 Axes>
Let's plot the Price Elasticity of Demand over time to gauge the sales' responsiveness to price changes.
# Price elasticity of demand = %ΔQuantity / %ΔPrice, month by month.
jd_monthly['Price Elasticity of Demand'] = (
    jd_monthly['Units % Change'] / jd_monthly['Price % Change']
)

fig, ax = plt.subplots()
ax.plot(
    jd_monthly.index,
    jd_monthly['Price Elasticity of Demand'],
    color='blue',
    label='Price Elasticity of Demand',
)
ax.set_xlabel('Date')
ax.set_ylabel('Price Elasticity of Demand')
# Elasticity blows up when the price change is near zero, so clamp the view.
ax.set_ylim([-5, 5])
# Reference lines at unit elasticity (±1).
ax.axhline(y=-1, color='r', linestyle='--', label='-1')
ax.axhline(y=1, color='g', linestyle='--', label='1')
plt.title('JD Blk Monthly Price Elasticity of Demand')
plt.legend(loc="best")
plt.show()
# Resample to quarterly frequency; summing is correct for the level
# columns (Units, $, ...).
jd_quarterly = jd_monthly.resample('Q').sum()
# BUG FIX: the original summed the monthly percentages and divided by a
# hard-coded 3, which is wrong for partial quarters at the edges of the
# data (e.g. the first quarter only contains 2 months after the first
# NaN row was dropped). Resampling the percentage columns with mean()
# averages each quarter over the months it actually contains.
pct_cols = ['Units % Change', 'Price % Change']
jd_quarterly[pct_cols] = jd_monthly[pct_cols].resample('Q').mean()
# Quarterly price elasticity of demand = %ΔQuantity / %ΔPrice.
jd_quarterly['Price Elasticity of Demand'] = jd_quarterly['Units % Change'] / jd_quarterly['Price % Change']
# Plot the quarterly Price Elasticity of Demand.
fig, ax = plt.subplots()
ax.plot(jd_quarterly.index, jd_quarterly['Price Elasticity of Demand'], color='blue', label='Price Elasticity of Demand')
ax.set_xlabel('Date')
ax.set_ylabel('Price Elasticity of Demand')
# Elasticity blows up when the price change is near zero, so clamp the view.
ax.set_ylim([-5, 5])
# Reference lines at unit elasticity (±1).
ax.axhline(y=-1, color='r', linestyle='--', label='-1')
ax.axhline(y=1, color='g', linestyle='--', label='1')
plt.title('JD Blk Quarterly Price Elasticity of Demand')
plt.legend(loc="best")
plt.show()
The Price Elasticity appears to exhibit a lack of stability and consistency, posing challenges for the implementation of an effective pricing strategy.
We will now embark on a more in-depth analysis by constructing statistical models to examine the pricing dynamics.
import statsmodels.api as sm

# Jack Daniel's Black in the two workhorse bottle sizes.
_jd_black = df['BA BRAND EXTENSION'] == "JACK DANIELS BLACK BRBN WSKY"
jd_data_750ml = df[_jd_black & (df['BA SIZE'] == '750ML')]
jd_data_175L = df[_jd_black & (df['BA SIZE'] == '1.75L')]

# Dependent variable: units sold.
y_750ml = jd_data_750ml['Units']
y_175L = jd_data_175L['Units']

# Independent variable: unit price, with an intercept column added.
X_750ml = sm.add_constant(jd_data_750ml['Unit Price'])
X_175L = sm.add_constant(jd_data_175L['Unit Price'])

# One ordinary least squares demand regression per bottle size.
model_750ml = sm.OLS(y_750ml, X_750ml)
model_175L = sm.OLS(y_175L, X_175L)
results_750ml = model_750ml.fit()
results_175L = model_175L.fit()

print("750ML Results:")
print(results_750ml.summary())
print("\n1.75L Results:")
print(results_175L.summary())
750ML Results:
OLS Regression Results
==============================================================================
Dep. Variable: Units R-squared: 0.015
Model: OLS Adj. R-squared: 0.012
Method: Least Squares F-statistic: 5.438
Date: Thu, 13 Jul 2023 Prob (F-statistic): 0.0202
Time: 21:47:32 Log-Likelihood: -4437.0
No. Observations: 369 AIC: 8878.
Df Residuals: 367 BIC: 8886.
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const 5.222e+04 1.45e+04 3.600 0.000 2.37e+04 8.07e+04
Unit Price -1857.2265 796.428 -2.332 0.020 -3423.361 -291.092
==============================================================================
Omnibus: 154.177 Durbin-Watson: 0.368
Prob(Omnibus): 0.000 Jarque-Bera (JB): 425.913
Skew: 2.072 Prob(JB): 3.27e-93
Kurtosis: 6.246 Cond. No. 126.
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
1.75L Results:
OLS Regression Results
==============================================================================
Dep. Variable: Units R-squared: 0.091
Model: OLS Adj. R-squared: 0.084
Method: Least Squares F-statistic: 12.50
Date: Thu, 13 Jul 2023 Prob (F-statistic): 0.000572
Time: 21:47:32 Log-Likelihood: -1473.5
No. Observations: 127 AIC: 2951.
Df Residuals: 125 BIC: 2957.
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const -1.856e+04 1.28e+04 -1.452 0.149 -4.39e+04 6738.506
Unit Price 1427.2172 403.713 3.535 0.001 628.220 2226.215
==============================================================================
Omnibus: 83.514 Durbin-Watson: 0.564
Prob(Omnibus): 0.000 Jarque-Bera (JB): 11.849
Skew: 0.373 Prob(JB): 0.00267
Kurtosis: 1.703 Cond. No. 171.
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Analyzing the results of the Ordinary Least Squares (OLS) regression for Jack Daniel's Black Bourbon Whiskey in both 750ml and 1.75L sizes, we are trying to understand the relationship between the unit price of the whiskey and the quantity of units sold.
For the 750ml size, the model's coefficient for 'Unit Price' is negative (about -1857), indicating an inverse relationship between unit price and units sold: when the unit price increases, the number of units sold decreases — typical demand behavior, all else being equal. For the 1.75L size, however, the 'Unit Price' coefficient is positive (about +1427), which runs counter to simple demand theory and suggests confounding factors (promotions, seasonality, or distribution changes) are influencing the relationship.
However, it's important to notice that both models present low R-squared values (0.015 for the 750ml model and 0.091 for the 1.75L model). An R-squared value, also known as the coefficient of determination, explains the proportion of variance in the dependent variable (Units) that can be predicted from the independent variable (Unit Price). In these cases, these low R-squared values suggest that the model explains very little of the variance in sales.
In addition, the p-value for the 'Unit Price' coefficient is below 0.05 in both models (0.020 for 750ml and 0.001 for 1.75L), so we can reject the null hypothesis that the coefficient is zero: unit price has a statistically significant association with units sold, even though it explains only a small fraction of the variance.
The Durbin-Watson statistic, which tests for the presence of autocorrelation among the residuals, also presents low values, especially for the 750ml model, suggesting that there might be autocorrelation in the data.
Therefore, while it's possible to suggest an inverse relationship between price and quantity sold, these models indicate that more factors should be taken into consideration to accurately predict sales for Jack Daniel's Black Bourbon Whiskey in both 750ml and 1.75L sizes. This could be due to limitations in the dataset, and having additional data (like customer demographics, income levels, or market trends) could improve the model's accuracy.
Conducting a correlation analysis among all numerical variables in the dataset serves as a valuable initial step to gain insights into the relationships between the variables.
import seaborn as sns

# Keep only the numeric columns — corr() is undefined for the rest.
numerical_columns = jd_df.select_dtypes(include=[np.number])
correlation_matrix = numerical_columns.corr()

# Heatmap of all pairwise correlations.
plt.figure(figsize=(14, 12))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', cbar=True)
plt.title("Correlation Heatmap for Numeric Variables of JACK DANIELS BLACK BRBN WSKY")
plt.show()

# Rank every variable by its correlation with the target 'Units'.
correlation_with_units = correlation_matrix['Units'].sort_values(ascending=False)
correlation_with_units
Units 1.000000 Units 2YA 0.994852 Units YA 0.986883 Any Promo Units YA 0.967651 Any Promo Units 0.966348 Any Promo $ YA 0.963412 Any Promo $ 0.951082 $ 0.943557 $ YA 0.942398 $ 2YA 0.924789 Any Promo $ 2YA 0.921317 %ACV YA 0.916652 9L EQ YA 0.903788 9L EQ 0.902712 %ACV 0.898392 9L EQ 2YA 0.873871 Base Unit Price YA 0.263407 Avg Unit Price YA 0.245571 Base Unit Price 2YA 0.221183 Avg Unit Price 2YA 0.204013 Unit Price YA 0.174999 Base Unit Price 0.151251 Avg Unit Price 0.129999 Any Promo Unit Price 2YA 0.121143 Unit Price 0.120762 Any Promo Unit Price YA 0.098827 Any Promo Unit Price 0.092000 Month 0.035749 Quarter 0.022460 Year -0.027558 Name: Units, dtype: float64
# Correlation magnitude above which a pair is flagged as collinear.
threshold = 0.8
# Collect each highly correlated pair once; the lexical comparison
# column1 < column2 keeps exactly one orientation of every pair.
highly_correlated = []
for column1 in correlation_matrix.columns:
    for column2 in correlation_matrix.columns:
        if column1 < column2 and abs(correlation_matrix[column1][column2]) > threshold:
            highly_correlated.append((column1, column2))
for pair in highly_correlated:
    print(pair)
('Units', 'Units YA')
('Units', 'Units 2YA')
('$', 'Units')
('$', '$ YA')
('$', 'Units YA')
('$', '9L EQ')
('$', '9L EQ YA')
('$', '%ACV')
('$', 'Any Promo Units YA')
('$', 'Any Promo $ YA')
('$', 'Any Promo $')
('$', 'Any Promo Units')
('$', '%ACV YA')
('$', '$ 2YA')
('$', 'Units 2YA')
('$', '9L EQ 2YA')
('$', 'Any Promo $ 2YA')
('$ YA', 'Units')
('$ YA', 'Units YA')
('$ YA', '9L EQ')
('$ YA', '9L EQ YA')
('$ YA', '%ACV')
('$ YA', 'Any Promo Units YA')
('$ YA', 'Any Promo $ YA')
('$ YA', 'Any Promo $')
('$ YA', 'Any Promo Units')
('$ YA', '%ACV YA')
('$ YA', 'Units 2YA')
('$ YA', '9L EQ 2YA')
('$ YA', 'Any Promo $ 2YA')
('9L EQ', 'Units')
('9L EQ', 'Units YA')
('9L EQ', '9L EQ YA')
('9L EQ', 'Any Promo Units YA')
('9L EQ', 'Any Promo $ YA')
('9L EQ', 'Any Promo $')
('9L EQ', 'Any Promo Units')
('9L EQ', 'Units 2YA')
('9L EQ', '9L EQ 2YA')
('9L EQ', 'Any Promo $ 2YA')
('9L EQ YA', 'Units')
('9L EQ YA', 'Units YA')
('9L EQ YA', 'Any Promo Units YA')
('9L EQ YA', 'Any Promo $ YA')
('9L EQ YA', 'Any Promo $')
('9L EQ YA', 'Any Promo Units')
('9L EQ YA', 'Units 2YA')
('9L EQ YA', 'Any Promo $ 2YA')
('%ACV', 'Units')
('%ACV', 'Units YA')
('%ACV', '9L EQ')
('%ACV', '9L EQ YA')
('%ACV', 'Any Promo Units YA')
('%ACV', 'Any Promo $ YA')
('%ACV', 'Any Promo $')
('%ACV', 'Any Promo Units')
('%ACV', '%ACV YA')
('%ACV', 'Units 2YA')
('%ACV', '9L EQ 2YA')
('%ACV', 'Any Promo $ 2YA')
('Any Promo Units YA', 'Units')
('Any Promo Units YA', 'Units YA')
('Any Promo Units YA', 'Units 2YA')
('Any Promo $ YA', 'Units')
('Any Promo $ YA', 'Units YA')
('Any Promo $ YA', 'Any Promo Units YA')
('Any Promo $ YA', 'Any Promo Units')
('Any Promo $ YA', 'Units 2YA')
('Any Promo $', 'Units')
('Any Promo $', 'Units YA')
('Any Promo $', 'Any Promo Units YA')
('Any Promo $', 'Any Promo $ YA')
('Any Promo $', 'Any Promo Units')
('Any Promo $', 'Units 2YA')
('Any Promo $', 'Any Promo $ 2YA')
('Any Promo Units', 'Units')
('Any Promo Units', 'Units YA')
('Any Promo Units', 'Any Promo Units YA')
('Any Promo Units', 'Units 2YA')
('%ACV YA', 'Units')
('%ACV YA', 'Units YA')
('%ACV YA', '9L EQ')
('%ACV YA', '9L EQ YA')
('%ACV YA', 'Any Promo Units YA')
('%ACV YA', 'Any Promo $ YA')
('%ACV YA', 'Any Promo $')
('%ACV YA', 'Any Promo Units')
('%ACV YA', 'Units 2YA')
('%ACV YA', '9L EQ 2YA')
('%ACV YA', 'Any Promo $ 2YA')
('$ 2YA', 'Units')
('$ 2YA', '$ YA')
('$ 2YA', 'Units YA')
('$ 2YA', '9L EQ')
('$ 2YA', '9L EQ YA')
('$ 2YA', '%ACV')
('$ 2YA', 'Any Promo Units YA')
('$ 2YA', 'Any Promo $ YA')
('$ 2YA', 'Any Promo $')
('$ 2YA', 'Any Promo Units')
('$ 2YA', '%ACV YA')
('$ 2YA', 'Units 2YA')
('$ 2YA', '9L EQ 2YA')
('$ 2YA', 'Any Promo $ 2YA')
('Units 2YA', 'Units YA')
('Avg Unit Price', 'Avg Unit Price YA')
('Avg Unit Price', 'Avg Unit Price 2YA')
('Avg Unit Price', 'Base Unit Price')
('Avg Unit Price', 'Base Unit Price YA')
('Avg Unit Price', 'Base Unit Price 2YA')
('Avg Unit Price', 'Unit Price')
('Avg Unit Price', 'Unit Price YA')
('Avg Unit Price YA', 'Base Unit Price')
('Avg Unit Price YA', 'Base Unit Price YA')
('Avg Unit Price YA', 'Base Unit Price 2YA')
('Avg Unit Price YA', 'Unit Price')
('Avg Unit Price YA', 'Unit Price YA')
('Avg Unit Price 2YA', 'Avg Unit Price YA')
('Avg Unit Price 2YA', 'Base Unit Price')
('Avg Unit Price 2YA', 'Base Unit Price YA')
('Avg Unit Price 2YA', 'Base Unit Price 2YA')
('Avg Unit Price 2YA', 'Unit Price')
('Avg Unit Price 2YA', 'Unit Price YA')
('Base Unit Price', 'Base Unit Price YA')
('Base Unit Price', 'Base Unit Price 2YA')
('Base Unit Price', 'Unit Price')
('Base Unit Price', 'Unit Price YA')
('Base Unit Price YA', 'Unit Price')
('Base Unit Price YA', 'Unit Price YA')
('Base Unit Price 2YA', 'Base Unit Price YA')
('Base Unit Price 2YA', 'Unit Price')
('Base Unit Price 2YA', 'Unit Price YA')
('9L EQ 2YA', 'Units')
('9L EQ 2YA', 'Units YA')
('9L EQ 2YA', '9L EQ YA')
('9L EQ 2YA', 'Any Promo Units YA')
('9L EQ 2YA', 'Any Promo $ YA')
('9L EQ 2YA', 'Any Promo $')
('9L EQ 2YA', 'Any Promo Units')
('9L EQ 2YA', 'Units 2YA')
('9L EQ 2YA', 'Any Promo $ 2YA')
('Any Promo Unit Price', 'Avg Unit Price')
('Any Promo Unit Price', 'Avg Unit Price YA')
('Any Promo Unit Price', 'Avg Unit Price 2YA')
('Any Promo Unit Price', 'Base Unit Price')
('Any Promo Unit Price', 'Base Unit Price YA')
('Any Promo Unit Price', 'Base Unit Price 2YA')
('Any Promo Unit Price', 'Any Promo Unit Price YA')
('Any Promo Unit Price', 'Any Promo Unit Price 2YA')
('Any Promo Unit Price', 'Unit Price')
('Any Promo Unit Price', 'Unit Price YA')
('Any Promo Unit Price YA', 'Avg Unit Price')
('Any Promo Unit Price YA', 'Avg Unit Price YA')
('Any Promo Unit Price YA', 'Avg Unit Price 2YA')
('Any Promo Unit Price YA', 'Base Unit Price')
('Any Promo Unit Price YA', 'Base Unit Price YA')
('Any Promo Unit Price YA', 'Base Unit Price 2YA')
('Any Promo Unit Price YA', 'Unit Price')
('Any Promo Unit Price YA', 'Unit Price YA')
('Any Promo Unit Price 2YA', 'Avg Unit Price')
('Any Promo Unit Price 2YA', 'Avg Unit Price YA')
('Any Promo Unit Price 2YA', 'Avg Unit Price 2YA')
('Any Promo Unit Price 2YA', 'Base Unit Price')
('Any Promo Unit Price 2YA', 'Base Unit Price YA')
('Any Promo Unit Price 2YA', 'Base Unit Price 2YA')
('Any Promo Unit Price 2YA', 'Any Promo Unit Price YA')
('Any Promo Unit Price 2YA', 'Unit Price')
('Any Promo Unit Price 2YA', 'Unit Price YA')
('Any Promo $ 2YA', 'Units')
('Any Promo $ 2YA', 'Units YA')
('Any Promo $ 2YA', 'Any Promo Units YA')
('Any Promo $ 2YA', 'Any Promo $ YA')
('Any Promo $ 2YA', 'Any Promo Units')
('Any Promo $ 2YA', 'Units 2YA')
('Month', 'Quarter')
('Unit Price', 'Unit Price YA')
The identified highly correlated variables in the dataset indicate a need to address multicollinearity issues and avoid including them simultaneously in the model. In the next phase, we will develop models that incorporate categorical variables alongside the existing ones, resulting in an improved analytical framework that accounts for a wider range of factors. This expanded model will provide a more comprehensive understanding of the relationship between the variables and the target variable (Units).
# Subset to the columns relevant for modeling JD Blk — market,
# manufacturer and distributor are constant for this single brand, so
# they carry no information.
# FIX: take an explicit .copy() so the dtype conversions below modify an
# independent frame rather than a view of jd_df; this removes the
# SettingWithCopyWarning the original emitted on every assignment.
jd_df_ml = jd_df[['Units', 'BA PROOF', 'BA SPIRITS BRAND PRICE', 'BA SPIRITS PRICE TIER',
                  'BA SIZE', 'BA PACK SIZE', 'IMPORTED OR DOMESTIC', 'COMMODITY GROUP', 'YEARS AGED',
                  'CALORIE CLAIM', 'BA VALUE ADD', '$', '$ YA', 'Units YA', '9L EQ',
                  '9L EQ YA', '%ACV', 'Any Promo Units YA', 'Any Promo $ YA',
                  'Any Promo $', 'Any Promo Units', '%ACV YA', 'Year', 'Month', 'Quarter',
                  'Unit Price', 'Unit Price YA']].copy()
# Treat the calendar parts as categorical (object dtype) so they are
# dummy-encoded later instead of being used as continuous numbers.
jd_df_ml['Year'] = jd_df_ml['Year'].astype('object')
jd_df_ml['Quarter'] = jd_df_ml['Quarter'].astype('object')
jd_df_ml['Month'] = jd_df_ml['Month'].astype('object')
<ipython-input-50-efb7f8cb2169>:10: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
jd_df_ml['Year'] = jd_df_ml['Year'].astype('object')
<ipython-input-50-efb7f8cb2169>:11: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
jd_df_ml['Quarter'] = jd_df_ml['Quarter'].astype('object')
<ipython-input-50-efb7f8cb2169>:12: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
jd_df_ml['Month'] = jd_df_ml['Month'].astype('object')
# Check all unique values for each categorical column of the modeling frame.
# BUG FIX: the original iterated jd_df_ml.columns but read the values from
# jd_df — the un-subsetted, un-converted frame. Report the values of
# jd_df_ml itself, since that is what gets dummy-encoded next.
for column in jd_df_ml.columns:
    if jd_df_ml[column].dtype == 'object':  # categorical columns only
        unique_values = jd_df_ml[column].unique()
        print(f"Unique values for column '{column}':")
        print(unique_values)
        print()
Unique values for column 'BA PROOF': ['HIGH PROOF' 'ASSORTED' 'NOT COLLECTED'] Unique values for column 'BA SPIRITS BRAND PRICE': ['22-22.99'] Unique values for column 'BA SPIRITS PRICE TIER': ['PREMIUM'] Unique values for column 'BA SIZE': ['750ML' '1.75L' '375ML' '1L' '200ML' '50ML'] Unique values for column 'BA PACK SIZE': ['1PK' '10PK' '20PK'] Unique values for column 'IMPORTED OR DOMESTIC': ['NOT STATED' 'NOT APPLICABLE' 'NOT COLLECTED' nan] Unique values for column 'COMMODITY GROUP': ['SCOTCH AND WHISKEY' 'COMBINATION PACK'] Unique values for column 'YEARS AGED': ['NOT STATED' 'NOT APPLICABLE' 'NOT COLLECTED'] Unique values for column 'CALORIE CLAIM': ['NOT STATED' 'NOT APPLICABLE' 'NOT COLLECTED'] Unique values for column 'BA VALUE ADD': ['NON VAP' 'VAP'] Unique values for column 'Year': [2019 2020 2021 2022 2018 2023] Unique values for column 'Month': [12 6 3 9 11 2 8 10 5 7 4 1] Unique values for column 'Quarter': [4 2 1 3]
Next, create dummy variables, split training and testing data sets, standardize the features, then build a linear model with all variables:
# Step 1: Handle categorical variables
# FIX: the original cast Year/Quarter/Month on the full `df`, which never
# feeds the model — almost certainly a typo for jd_df_ml. Cast them on the
# modeling frame (a no-op repeat of the earlier conversion, kept for
# safety if this cell is run standalone).
jd_df_ml['Year'] = jd_df_ml['Year'].astype('object')
jd_df_ml['Quarter'] = jd_df_ml['Quarter'].astype('object')
jd_df_ml['Month'] = jd_df_ml['Month'].astype('object')
categorical_vars = ['BA PROOF', 'BA SPIRITS BRAND PRICE', 'BA SPIRITS PRICE TIER', 'BA SIZE',
                    'BA PACK SIZE', 'IMPORTED OR DOMESTIC', 'COMMODITY GROUP', 'YEARS AGED',
                    'CALORIE CLAIM', 'BA VALUE ADD', 'Year', 'Quarter', 'Month']
# Fill NaN values in every categorical column with 'Missing' so missing
# data gets its own dummy level instead of silently vanishing.
for var in categorical_vars:
    jd_df_ml[var] = jd_df_ml[var].fillna('Missing')
# One-hot encode; drop_first=True drops one level per variable to avoid
# the dummy-variable trap (perfect collinearity with the intercept).
jd_df_ml_encoded = pd.get_dummies(jd_df_ml, columns=categorical_vars, drop_first=True)
# Drop any rows that still contain NaNs in the numeric columns.
jd_df_ml_encoded = jd_df_ml_encoded.dropna()
# Report the dropped (benchmark) level of each variable — these are what
# the remaining dummy coefficients are measured against.
# BUG FIX: compare as strings. Dummy column suffixes are always strings,
# while the original levels of Year/Quarter/Month are ints, so the raw
# set difference wrongly reported EVERY level of those variables as
# dropped (visible in the original output).
for var in categorical_vars:
    original_categories = sorted(str(v) for v in jd_df_ml[var].unique())
    dummy_categories = [col.split('_')[-1] for col in jd_df_ml_encoded.columns if col.startswith(var + '_')]
    dropped_categories = sorted(set(original_categories) - set(dummy_categories))
    print(f'For variable {var}, dropped category: {dropped_categories}')
For variable BA PROOF, dropped category: ['ASSORTED'] For variable BA SPIRITS BRAND PRICE, dropped category: ['22-22.99'] For variable BA SPIRITS PRICE TIER, dropped category: ['PREMIUM'] For variable BA SIZE, dropped category: ['1.75L'] For variable BA PACK SIZE, dropped category: ['10PK'] For variable IMPORTED OR DOMESTIC, dropped category: ['Missing'] For variable COMMODITY GROUP, dropped category: ['COMBINATION PACK'] For variable YEARS AGED, dropped category: ['NOT APPLICABLE'] For variable CALORIE CLAIM, dropped category: ['NOT APPLICABLE'] For variable BA VALUE ADD, dropped category: ['NON VAP'] For variable Year, dropped category: [2018, 2019, 2020, 2021, 2022, 2023] For variable Quarter, dropped category: [1, 2, 3, 4] For variable Month, dropped category: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
<ipython-input-52-8fb97b40f316>:13: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
jd_df_ml[var] = jd_df_ml[var].fillna('Missing')
# Step 2: Split the data into training and testing sets
from sklearn.model_selection import train_test_split

# Features: every encoded column except the target.
X = jd_df_ml_encoded.drop(columns=['Units'])
# Target: units sold.
y = jd_df_ml_encoded['Units']

# Hold out 20% of the rows for evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Standardize the features to zero mean and unit variance.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# Fit the scaler on the training split only, then apply the SAME
# transformation to the test split — fitting on test data would leak
# information.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Step 4: Build and evaluate a baseline linear model.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

# Report the fitted coefficients and the held-out mean squared error.
print('Coefficients: \n', lr.coef_)
print('Mean squared error: %.2f' % mean_squared_error(y_test, y_pred))
Coefficients: [ 2.86521788e+04 2.75713799e+02 9.48771534e+03 -3.22075674e+03 -8.83733784e+03 1.61231296e+03 6.44276102e+03 -6.01738410e+03 -6.84068944e+03 1.11172889e+04 -1.34171317e+02 -6.27944037e+02 5.57509305e+01 -8.29311782e+01 -6.36646291e-12 -7.77703925e+01 4.74492736e+01 7.17091822e+02 3.60021107e+01 4.02391181e+02 -2.26825888e+01 8.29311782e+01 -1.38668126e+02 -2.35386148e+01 2.15844902e+02 -1.15563398e+01 -2.35386148e+01 -6.55774619e+00 -2.35386148e+01 -6.55774619e+00 6.55774619e+00 7.34191368e+01 -8.73193552e+01 4.51349009e+01 7.56456368e+01 -1.35960462e+02 -6.42928653e+01 -2.21232426e+01 -8.33478827e+01 -3.92597359e+01 -1.59750818e+01 -5.98146278e+01 -2.13658203e+01 -2.38224206e+01 1.00803910e+00 -2.43424509e+01 -1.10242079e+01 -1.15785581e+02 -3.15225113e+01 1.38358962e+01] Mean squared error: 2959042.00
The provided results correspond to a linear regression model used to predict a target variable. The coefficients suggest the predicted change in the target variable per unit increase in the respective feature, with the sign indicating the direction of the relationship. The Mean Squared Error (MSE) is 2,959,042.00, indicating the average squared difference between actual and predicted values. High MSE suggests potential room for model improvement.
Next, build a OLS model to verify:
# Fit the ordinary least squares (OLS) model with all veriables
# Define a variable X for our features
X = jd_df_ml_encoded.drop(columns=['Units'])
X = sm.add_constant(X)
# Define a variable y for our target
y = jd_df_ml_encoded['Units']
X = X.astype('float64')
y = y.astype('float64')
model_all = sm.OLS(y, X)
results_all = model_all.fit()
# Print the results
print("All variables-ML Regression model Results:")
print(results_all.summary())
All variables-ML Regression model Results:
OLS Regression Results
==============================================================================
Dep. Variable: Units R-squared: 0.998
Model: OLS Adj. R-squared: 0.998
Method: Least Squares F-statistic: 6216.
Date: Thu, 13 Jul 2023 Prob (F-statistic): 0.00
Time: 21:47:34 Log-Likelihood: -4783.6
No. Observations: 545 AIC: 9645.
Df Residuals: 506 BIC: 9813.
Df Model: 38
Covariance Type: nonrobust
=======================================================================================================
coef std err t P>|t| [0.025 0.975]
-------------------------------------------------------------------------------------------------------
const 603.2099 1184.479 0.509 0.611 -1723.892 2930.312
$ 0.0406 0.002 17.433 0.000 0.036 0.045
$ YA 0.0019 0.003 0.704 0.482 -0.003 0.007
Units YA 0.2654 0.034 7.883 0.000 0.199 0.332
9L EQ -0.8078 0.415 -1.945 0.052 -1.624 0.008
9L EQ YA -2.3343 0.390 -5.979 0.000 -3.101 -1.567
%ACV 61.6634 35.379 1.743 0.082 -7.845 131.172
Any Promo Units YA 0.2992 0.046 6.539 0.000 0.209 0.389
Any Promo $ YA -0.0139 0.002 -6.057 0.000 -0.018 -0.009
Any Promo $ -0.0195 0.002 -8.783 0.000 -0.024 -0.015
Any Promo Units 0.6032 0.040 14.981 0.000 0.524 0.682
%ACV YA -17.8802 34.888 -0.513 0.609 -86.423 50.662
Unit Price -65.2186 32.337 -2.017 0.044 -128.749 -1.688
Unit Price YA 1.9994 40.288 0.050 0.960 -77.153 81.152
BA PROOF_HIGH PROOF -865.4163 558.980 -1.548 0.122 -1963.623 232.791
BA PROOF_NOT COLLECTED -6.697e-13 5.06e-12 -0.132 0.895 -1.06e-11 9.27e-12
BA SIZE_1L 24.7775 519.632 0.048 0.962 -996.124 1045.679
BA SIZE_200ML 267.7315 1213.264 0.221 0.825 -2115.925 2651.388
BA SIZE_375ML 3023.9234 1053.696 2.870 0.004 953.766 5094.081
BA SIZE_50ML 441.0407 1462.982 0.301 0.763 -2433.226 3315.307
BA SIZE_750ML 1093.6588 802.622 1.363 0.174 -483.223 2670.540
BA PACK SIZE_1PK 11.1199 935.356 0.012 0.991 -1826.539 1848.779
BA PACK SIZE_20PK 1468.6262 1483.814 0.990 0.323 -1446.570 4383.822
IMPORTED OR DOMESTIC_NOT APPLICABLE 30.4180 434.203 0.070 0.944 -822.645 883.481
IMPORTED OR DOMESTIC_NOT COLLECTED -178.4928 186.802 -0.956 0.340 -545.495 188.510
IMPORTED OR DOMESTIC_NOT STATED 767.4816 417.305 1.839 0.066 -52.383 1587.346
COMMODITY GROUP_SCOTCH AND WHISKEY 12.9060 370.586 0.035 0.972 -715.170 740.982
YEARS AGED_NOT COLLECTED -178.4928 186.802 -0.956 0.340 -545.495 188.510
YEARS AGED_NOT STATED 191.3987 338.268 0.566 0.572 -473.185 855.982
CALORIE CLAIM_NOT COLLECTED -178.4928 186.802 -0.956 0.340 -545.495 188.510
CALORIE CLAIM_NOT STATED 191.3987 338.268 0.566 0.572 -473.185 855.982
BA VALUE ADD_VAP 411.8112 887.462 0.464 0.643 -1331.753 2155.375
Year_2019 452.1086 243.794 1.854 0.064 -26.865 931.082
Year_2020 -180.7826 216.982 -0.833 0.405 -607.078 245.513
Year_2021 179.3212 220.483 0.813 0.416 -253.854 612.497
Year_2022 168.7596 215.978 0.781 0.435 -255.565 593.084
Year_2023 -16.1969 553.051 -0.029 0.977 -1102.756 1070.362
Quarter_2 -348.6986 223.578 -1.560 0.119 -787.953 90.556
Quarter_3 -214.2093 220.364 -0.972 0.331 -647.151 218.732
Quarter_4 -382.7398 220.540 -1.735 0.083 -816.027 50.547
Month_2 -152.7288 317.835 -0.481 0.631 -777.167 471.710
Month_3 23.9130 321.637 0.074 0.941 -607.996 655.822
Month_4 -21.0772 245.196 -0.086 0.932 -502.805 460.651
Month_5 -243.3534 220.646 -1.103 0.271 -676.849 190.142
Month_6 -84.2680 223.131 -0.378 0.706 -522.646 354.110
Month_7 44.6976 218.651 0.204 0.838 -384.878 474.273
Month_8 -182.8653 213.475 -0.857 0.392 -602.272 236.542
Month_9 -76.0416 216.101 -0.352 0.725 -500.607 348.523
Month_10 -428.3240 211.046 -2.030 0.043 -842.959 -13.689
Month_11 -93.5013 219.222 -0.427 0.670 -524.199 337.196
Month_12 139.0855 218.391 0.637 0.525 -289.979 568.150
==============================================================================
Omnibus: 121.036 Durbin-Watson: 1.878
Prob(Omnibus): 0.000 Jarque-Bera (JB): 1199.982
Skew: 0.654 Prob(JB): 2.67e-261
Kurtosis: 10.151 Cond. No. 1.13e+16
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 7.49e-18. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
The results represent an Ordinary Least Squares (OLS) regression model. The model is trained to predict 'Units' using various predictors. It is notable that the R-squared and adjusted R-squared values are both 0.998, indicating a very high proportion of variance in the target variable that's predictable from the independent variables. This, however, may suggest an overfitting model.
A number of variables are statistically significant, with p-values less than 0.05, such as '$', 'Units YA', '9L EQ', '9L EQ YA', 'Any Promo Units YA', 'Any Promo $ YA', 'Any Promo $', 'Any Promo Units', and 'BA PROOF_HIGH PROOF'. These coefficients indicate the direction and strength of the relationship with the target.
However, the presence of very small eigenvalues and the warning about potential strong multicollinearity problems or singular design matrix suggests some issues with the data, which can lead to unstable estimates and reduced interpretability of the model. Collinearity can be addressed with techniques like Variable Inflation Factor (VIF), or by selectively removing some variables.
Next, run a stepwise selection function to identify the valuable variables:
def stepwise_selection(X, y,
                       initial_list=[],
                       threshold_in=0.01,
                       threshold_out=0.05,
                       verbose=True):
    """Perform a forward-backward (stepwise) feature selection based on the
    p-values of statsmodels OLS fits.

    Arguments:
        X - pandas.DataFrame with candidate features
        y - list-like with the target
        initial_list - list of features to start with (column names of X);
            copied on entry, so the shared mutable default is never mutated
        threshold_in - include a feature if its p-value < threshold_in
        threshold_out - exclude a feature if its p-value > threshold_out
        verbose - whether to print the sequence of inclusions and exclusions

    Returns:
        list of selected feature names.

    Always set threshold_in < threshold_out to avoid infinite looping.
    See https://en.wikipedia.org/wiki/Stepwise_regression for the details.
    """
    included = list(initial_list)
    while True:
        changed = False
        # ---- forward step: try adding each currently excluded feature ----
        excluded = list(set(X.columns) - set(included))
        # FIX: give the Series an explicit float dtype; constructing a Series
        # with an index but no data otherwise yields a (deprecated)
        # object-dtype Series, which breaks/warns on min()/idxmin().
        new_pval = pd.Series(index=excluded, dtype=float)
        for new_column in excluded:
            model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included + [new_column]]))).fit()
            new_pval[new_column] = model.pvalues[new_column]
        # min() of an empty Series is NaN and "NaN < threshold_in" is False,
        # so the forward step safely no-ops once every feature is included.
        best_pval = new_pval.min()
        if best_pval < threshold_in:
            best_feature = new_pval.idxmin()
            included.append(best_feature)
            changed = True
            if verbose:
                print('Add {:30} with p-value {:.6}'.format(best_feature, best_pval))
        # ---- backward step: drop the least significant included feature ----
        model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
        # use all coefs except the intercept
        pvalues = model.pvalues.iloc[1:]
        worst_pval = pvalues.max()  # NaN if pvalues is empty -> comparison below is False
        if worst_pval > threshold_out:
            worst_feature = pvalues.idxmax()
            included.remove(worst_feature)
            changed = True
            if verbose:
                print('Drop {:30} with p-value {:.6}'.format(worst_feature, worst_pval))
        if not changed:
            break
    return included
# Run stepwise selection on the candidate feature matrix X against target y
result = stepwise_selection(X, y)
print('resulting features:')
# Bare expression: rendered by the notebook as this cell's output
result
Add Units YA with p-value 0.0 Add $ with p-value 9.55191e-45 Add 9L EQ YA with p-value 3.49556e-146 Add Any Promo Units with p-value 2.31971e-57 Add Any Promo $ with p-value 4.9559e-14 Add BA SIZE_375ML with p-value 4.13198e-18 Add Unit Price with p-value 1.07231e-05 Add const with p-value 7.63188e-05 Add 9L EQ with p-value 0.00332931 resulting features:
['Units YA', '$', '9L EQ YA', 'Any Promo Units', 'Any Promo $', 'BA SIZE_375ML', 'Unit Price', 'const', '9L EQ']
# Fit the ordinary least squares (OLS) model with selected variables + date variables
# Define a variable X for our features
# NOTE(review): Year/Quarter/Month dummies overlap with each other, which is
# the likely source of the multicollinearity warning in the summary below.
X_1 = jd_df_ml_encoded[['BA SIZE_375ML', 'Unit Price', 'BA SIZE_1L', 'Any Promo $', 'Year_2020',
                        'Year_2021', 'Year_2022', 'Year_2023', 'Quarter_2', 'Quarter_3',
                        'Quarter_4', 'Month_2', 'Month_3', 'Month_4', 'Month_5', 'Month_6',
                        'Month_7', 'Month_8', 'Month_9', 'Month_10', 'Month_11', 'Month_12']]
# Add an explicit intercept column ('const') for statsmodels
X_1 = sm.add_constant(X_1)
# Define a variable y for our target
y = jd_df_ml_encoded['Units']
# statsmodels requires numeric dtypes; cast everything to float
X_1 = X_1.astype('float64')
y = y.astype('float64')
model_1 = sm.OLS(y, X_1)
results_1 = model_1.fit()
# Print the results
print("Selected variables-ML Regression model Results:")
print(results_1.summary())
Selected variables-ML Regression model Results:
OLS Regression Results
==============================================================================
Dep. Variable: Units R-squared: 0.921
Model: OLS Adj. R-squared: 0.918
Method: Least Squares F-statistic: 320.2
Date: Thu, 13 Jul 2023 Prob (F-statistic): 3.11e-274
Time: 21:47:36 Log-Likelihood: -5768.7
No. Observations: 545 AIC: 1.158e+04
Df Residuals: 525 BIC: 1.166e+04
Df Model: 19
Covariance Type: nonrobust
=================================================================================
coef std err t P>|t| [0.025 0.975]
---------------------------------------------------------------------------------
const 2501.0184 2143.318 1.167 0.244 -1709.515 6711.552
BA SIZE_375ML 9037.0564 1483.894 6.090 0.000 6121.958 1.2e+04
Unit Price -442.4150 56.762 -7.794 0.000 -553.924 -330.906
BA SIZE_1L 8228.3692 1644.040 5.005 0.000 4998.665 1.15e+04
Any Promo $ 0.0879 0.001 73.562 0.000 0.086 0.090
Year_2020 8123.5910 1432.261 5.672 0.000 5309.925 1.09e+04
Year_2021 5004.7685 1427.819 3.505 0.000 2199.828 7809.709
Year_2022 5643.0782 1418.583 3.978 0.000 2856.281 8429.875
Year_2023 6838.9475 2029.234 3.370 0.001 2852.532 1.08e+04
Quarter_2 1291.0161 1318.565 0.979 0.328 -1299.296 3881.328
Quarter_3 893.0707 1306.595 0.684 0.495 -1673.725 3459.867
Quarter_4 62.6679 1304.409 0.048 0.962 -2499.833 2625.169
Month_2 -465.8479 1893.596 -0.246 0.806 -4185.804 3254.108
Month_3 -514.6330 1887.192 -0.273 0.785 -4222.008 3192.742
Month_4 2001.2845 1451.771 1.379 0.169 -850.709 4853.278
Month_5 50.0870 1314.199 0.038 0.970 -2531.648 2631.822
Month_6 -760.3554 1315.353 -0.578 0.563 -3344.357 1823.646
Month_7 333.9765 1297.024 0.257 0.797 -2214.018 2881.971
Month_8 747.1128 1268.468 0.589 0.556 -1744.784 3239.009
Month_9 -188.0187 1278.204 -0.147 0.883 -2699.041 2323.003
Month_10 -648.4006 1254.219 -0.517 0.605 -3112.305 1815.504
Month_11 -660.0134 1282.588 -0.515 0.607 -3179.648 1859.622
Month_12 1371.0819 1258.904 1.089 0.277 -1102.026 3844.189
==============================================================================
Omnibus: 98.559 Durbin-Watson: 1.368
Prob(Omnibus): 0.000 Jarque-Bera (JB): 1084.657
Skew: 0.394 Prob(JB): 2.95e-236
Kurtosis: 9.866 Cond. No. 3.88e+21
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 6.62e-30. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
The OLS regression model shows a high R-squared of 0.921, indicating it explains about 92% of the variance in units sold. Significant predictors include 'Any Promo $', 'BA SIZE_375ML', and the 'Year' dummy variables. However, several month and quarter variables aren't significant (high p-values), and there's a potential issue of multicollinearity (variables being highly correlated). This could require reconsideration of feature selection or usage of regularization techniques.
Next, build a LASSO model:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
# Using the same variables of the previous model
# Splitting the data into training and testing sets (70/30, fixed seed for reproducibility)
X_1_train, X_1_test, y_train, y_test = train_test_split(X_1, y, test_size=0.3, random_state=42)
# Training the LASSO model with a small regularization strength
lasso_01 = Lasso(alpha=0.1)
lasso_01.fit(X_1_train, y_train)
# Making predictions on both splits
y_train_pred = lasso_01.predict(X_1_train)
y_test_pred = lasso_01.predict(X_1_test)
# Evaluating the model via mean squared error
train_error = mean_squared_error(y_train, y_train_pred)
test_error = mean_squared_error(y_test, y_test_pred)
print(f'Training error: {train_error}')
print(f'Test error: {test_error}')
Training error: 93550822.76062308 Test error: 90349489.2116679
The Lasso regression model was trained and tested on the data. The mean squared errors for the training and test sets were approximately 93.55 million and 90.35 million, respectively.
Next, try to find the best alpha parameter for LASSO model.
# Build a function to test different alpha parameters and find the best one
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
def test_alphas(X_1, y, alphas):
    """Fit one Lasso model per alpha on a fixed train/test split.

    Arguments:
        X_1 - feature matrix
        y - target vector
        alphas - iterable of regularization strengths to try

    Returns:
        (train_errors, test_errors) - lists of MSEs aligned with `alphas`.
        The best alpha (lowest test MSE) is printed as a side effect.
    """
    train_errors = []
    test_errors = []
    best_alpha = None
    min_error = float('inf')
    # FIX: with a fixed random_state the split is identical on every
    # iteration, so create it once outside the loop instead of re-splitting
    # per alpha (same behavior, less wasted work).
    X_train, X_test, y_train, y_test = train_test_split(X_1, y, test_size=0.2, random_state=42)
    for alpha in alphas:
        # Initialize and fit the Lasso regression model for this alpha
        lasso = Lasso(alpha=alpha)
        lasso.fit(X_train, y_train)
        # Predict on the train and test sets
        train_predictions = lasso.predict(X_train)
        test_predictions = lasso.predict(X_test)
        # Record the mean squared errors
        train_error = mean_squared_error(y_train, train_predictions)
        test_error = mean_squared_error(y_test, test_predictions)
        train_errors.append(train_error)
        test_errors.append(test_error)
        # Track the alpha with the lowest test-set error
        if test_error < min_error:
            min_error = test_error
            best_alpha = alpha
    # Print best alpha
    print(f"Best alpha: {best_alpha}")
    return train_errors, test_errors
# Candidate regularization strengths to evaluate
alphas = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
train_errors, test_errors = test_alphas(X_1, y, alphas)
Best alpha: 1.0
from sklearn.metrics import mean_squared_error, r2_score
# Initialize the Lasso regression model with the best alpha found above
lasso_optimal = Lasso(alpha=1.0)
# Fit the model with the training data
# NOTE(review): this reuses the earlier 70/30 split, whereas the alpha search
# used its own 80/20 split — confirm this is intended.
lasso_optimal.fit(X_1_train, y_train)
# Predict the target variable for training data
y_train_pred_optimal = lasso_optimal.predict(X_1_train)
# Predict the target variable for test data
y_test_pred_optimal = lasso_optimal.predict(X_1_test)
# Calculate the mean squared errors
train_error_optimal = mean_squared_error(y_train, y_train_pred_optimal)
test_error_optimal = mean_squared_error(y_test, y_test_pred_optimal)
# Calculate R-squared scores
r2_train_optimal = r2_score(y_train, y_train_pred_optimal)
r2_test_optimal = r2_score(y_test, y_test_pred_optimal)
# Print errors and R-squared scores
print(f'Training error: {train_error_optimal}')
print(f'Test error: {test_error_optimal}')
print(f'Training R2: {r2_train_optimal}')
print(f'Test R2: {r2_test_optimal}')
Training error: 93551500.52960433 Test error: 90302636.48853713 Training R2: 0.9135503340932862 Test R2: 0.9311286406275436
The performance of alpha=1.0 model seems to be very good and quite consistent across both training and testing datasets.
Training error vs Test error: The mean squared errors for both training and testing datasets are very close (93,551,500.53 for training vs 90,302,636.49 for testing). This indicates that the model is not overfitting, as it's performing almost equally well on unseen data as on the data it was trained on.
Training R2 vs Test R2: The R2 score measures the proportion of the variance in the dependent variable that is predictable from the independent variables. A score of 1 indicates a perfect fit, and 0 indicates that the model fails to explain the variability in the outcome. R2 scores are quite high (0.914 for training and 0.931 for testing), which suggests that the model explains a large portion of the variability in the target variable, and is performing well.
Test R2 close to Training R2: When the R2 score on the test set is close to the training set, this indicates that the model is not suffering from overfitting and is likely to generalize well to unseen data.
Overall: The LASSO model with an alpha of 1.0 has performed very well on this dataset. It provides a good fit without overfitting, as demonstrated by similar performance metrics on both the training and testing datasets.
# Initialize the Lasso regression model with the best alpha = 1.0
lasso_best = Lasso(alpha=1.0)
lasso_best.fit(X_1_train, y_train)
# Print out the names of the columns and their corresponding coefficients
coef = pd.Series(lasso_best.coef_, index = X_1.columns)
print("Lasso picked " + str(sum(coef != 0)) + " variables and eliminated the other " + str(sum(coef == 0)) + " variables")
# Display the coefficients sorted ascending (most negative first); an
# eliminated variable is one whose coefficient was shrunk to exactly 0
coef.sort_values()
Lasso picked 22 variables and eliminated the other 1 variables
Month_7 -823.161138 Month_10 -578.414256 Unit Price -401.453471 Month_6 -46.875601 const 0.000000 Any Promo $ 0.087741 Month_2 139.737182 Month_3 290.499637 Month_11 363.131568 Month_9 424.535197 Quarter_4 460.988575 Month_12 923.094823 Month_5 1257.828599 Month_4 1572.858141 Month_8 1786.908063 Quarter_2 2233.973006 Quarter_3 2270.913685 Year_2022 5031.844351 Year_2021 5269.238938 Year_2020 7109.573356 Year_2023 7647.726017 BA SIZE_1L 8431.432652 BA SIZE_375ML 8775.013194 dtype: float64
From the analysis of the coefficients, we can observe the following:
BA SIZE_375ML, BA SIZE_1L, Year_2023, Year_2020, and Year_2021 are the top five features positively impacting the Units variable. They indicate that as these features increase, the number of units also increases. For instance, BA SIZE_375ML being the most influential feature suggests that a unit increase in BA SIZE_375ML leads to an approximate increase of 8,775 units sold, all other factors being constant.
Month_7, Month_10, and Month_6 are negatively influencing the Units variable, indicating that sales in these months tend to be lower. For example, Month_7 decreases the number of units sold by approximately 823 units, all other factors being constant.
Unit Price negatively impacts the Units variable, implying that an increase in the unit price tends to decrease the number of units sold.
One variable (const) has been eliminated by the Lasso regression — its coefficient was shrunk to exactly zero. According to the Lasso model, this column does not contribute to the model's predictive power, which is expected here since Lasso fits its own intercept.
The first step is to create separate dataframes for each of the selected brands.
# Build the Jack Daniel's monthly series from the BROWN-FORMAN subset:
# keep the brand family, sum 'Units' per month, and order chronologically.
jd_bf_df = (
    bf_df[bf_df['BA BRAND FAMILY'] == "JACK DANIEL'S"]
    .groupby('Date')['Units']
    .sum()
    .reset_index()
    .sort_values(by='Date')
)
print(jd_bf_df)
Date Units 0 2018-07-01 252097.248016 1 2018-08-01 238833.896255 2 2018-09-01 307412.985762 3 2018-10-01 263185.081783 4 2018-11-01 376072.914619 5 2018-12-01 580090.827599 6 2019-01-01 258072.368214 7 2019-02-01 285771.147464 8 2019-03-01 351845.795767 9 2019-04-01 251974.211596 10 2019-05-01 257586.037593 11 2019-06-01 367373.770634 12 2019-07-01 262050.422839 13 2019-08-01 269341.651364 14 2019-09-01 321525.198143 15 2019-10-01 293070.093567 16 2019-11-01 391904.658136 17 2019-12-01 609586.345954 18 2020-01-01 239420.272035 19 2020-02-01 281407.277954 20 2020-03-01 444725.409801 21 2020-04-01 343886.527400 22 2020-05-01 308454.197165 23 2020-06-01 396433.480177 24 2020-07-01 281604.709265 25 2020-08-01 302074.260713 26 2020-09-01 358193.210618 27 2020-10-01 297472.416650 28 2020-11-01 353374.055514 29 2020-12-01 549243.917448 30 2021-01-01 277993.865892 31 2021-02-01 300476.782136 32 2021-03-01 340164.316231 33 2021-04-01 242526.903914 34 2021-05-01 251685.371166 35 2021-06-01 330501.142415 36 2021-07-01 236839.919042 37 2021-08-01 254877.233404 38 2021-09-01 315231.733312 39 2021-10-01 250709.055105 40 2021-11-01 285468.622485 41 2021-12-01 464099.001200 42 2022-01-01 230749.363227 43 2022-02-01 253400.187802 44 2022-03-01 310244.938153 45 2022-04-01 221239.934778 46 2022-05-01 229334.750653 47 2022-06-01 299758.779811 48 2022-07-01 216995.959089 49 2022-08-01 213862.612746 50 2022-09-01 275978.099429 51 2022-10-01 225161.290385 52 2022-11-01 281608.181703 53 2022-12-01 452824.465283 54 2023-01-01 242263.136103 55 2023-02-01 226462.243884 56 2023-03-01 309912.801702
#pip install fbprophet
from prophet import Prophet
# Prepare the data: Prophet requires columns named 'ds' (date) and 'y' (value)
jd_bf_df = jd_bf_df.rename(columns={'Date': 'ds', 'Units': 'y'})
# Initialize the model with default settings
model = Prophet()
# Fit the model to the monthly sales data
model.fit(jd_bf_df)
# Predict sales for the next 12 months ('MS' = month-start frequency)
future = model.make_future_dataframe(periods=12, freq='MS')
forecast = model.predict(future)
# Plot the results
model.plot(forecast)
plt.show()
# View the components of the forecast (trend and yearly seasonality)
model.plot_components(forecast)
plt.show()
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this. INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. DEBUG:cmdstanpy:input tempfile: /tmp/tmprsp45tsj/209om9be.json DEBUG:cmdstanpy:input tempfile: /tmp/tmprsp45tsj/27cp96p6.json DEBUG:cmdstanpy:idx 0 DEBUG:cmdstanpy:running CmdStan, num_threads: None DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.10/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=76727', 'data', 'file=/tmp/tmprsp45tsj/209om9be.json', 'init=/tmp/tmprsp45tsj/27cp96p6.json', 'output', 'file=/tmp/tmprsp45tsj/prophet_modelvshb8zo5/prophet_model-20230713214737.csv', 'method=optimize', 'algorithm=newton', 'iter=10000'] 21:47:37 - cmdstanpy - INFO - Chain [1] start processing INFO:cmdstanpy:Chain [1] start processing 21:47:37 - cmdstanpy - INFO - Chain [1] done processing INFO:cmdstanpy:Chain [1] done processing
We have generated a forecast for the next 12 months based on the historical sales data. The blue line in the plot represents the predicted sales, while the shaded region represents the uncertainty intervals of the forecast.
Next, build another time series model:
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error
import numpy as np
# Stationarity check via the Augmented Dickey-Fuller test
def test_stationarity(timeseries):
    """Run the ADF test on *timeseries* and print a labelled summary."""
    print('Results of Dickey-Fuller Test:')
    adf_out = adfuller(timeseries, autolag='AIC')
    report = pd.Series(
        adf_out[0:4],
        index=['Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used'],
    )
    # Append the critical values at each confidence level
    for crit_level, crit_value in adf_out[4].items():
        report['Critical Value (%s)' % crit_level] = crit_value
    print(report)
# Apply the stationarity test to the monthly units series
test_stationarity(jd_bf_df['y'])
# Plot autocorrelation and partial autocorrelation to guide ARIMA order choice
plot_acf(jd_bf_df['y'])
plot_pacf(jd_bf_df['y'])
plt.show()
Results of Dickey-Fuller Test: Test Statistic 0.728319 p-value 0.990386 #Lags Used 11.000000 Number of Observations Used 45.000000 Critical Value (1%) -3.584829 Critical Value (5%) -2.928299 Critical Value (10%) -2.602344 dtype: float64
# Fit an ARIMA(5,1,0) model: 5 autoregressive lags, first-order differencing
model = ARIMA(jd_bf_df['y'], order=(5, 1, 0))
model_fit = model.fit()
# Summary of the fitted model
print(model_fit.summary())

# Inspect the residual errors visually
residuals = pd.DataFrame(model_fit.resid)
residuals.plot()
plt.show()

# Walk-forward validation: refit on the growing history and forecast
# one step ahead for each observation in the hold-out (last third)
X = jd_bf_df['y'].values
size = int(len(X) * 0.66)
train, test = X[:size], X[size:]
history = list(train)
predictions = []
for obs in test:
    model = ARIMA(history, order=(5, 1, 0))
    model_fit = model.fit()
    yhat = model_fit.forecast()[0]
    predictions.append(yhat)
    history.append(obs)

# Evaluation
error = mean_squared_error(test, predictions)
print('Test MSE: %.3f' % error)
# Plot forecasts against actual outcomes
plt.plot(test)
plt.plot(predictions, color='red')
plt.show()
SARIMAX Results
==============================================================================
Dep. Variable: y No. Observations: 57
Model: ARIMA(5, 1, 0) Log Likelihood -720.679
Date: Thu, 13 Jul 2023 AIC 1453.358
Time: 21:47:39 BIC 1465.511
Sample: 0 HQIC 1458.070
- 57
Covariance Type: opg
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
ar.L1 -0.5992 0.069 -8.730 0.000 -0.734 -0.465
ar.L2 -0.5795 0.123 -4.701 0.000 -0.821 -0.338
ar.L3 -0.1078 0.147 -0.733 0.464 -0.396 0.181
ar.L4 -0.2073 0.102 -2.034 0.042 -0.407 -0.008
ar.L5 -0.1779 0.061 -2.928 0.003 -0.297 -0.059
sigma2 4.858e+09 2.6e-11 1.87e+20 0.000 4.86e+09 4.86e+09
===================================================================================
Ljung-Box (L1) (Q): 0.00 Jarque-Bera (JB): 25.30
Prob(Q): 0.95 Prob(JB): 0.00
Heteroskedasticity (H): 0.27 Skew: 1.31
Prob(H) (two-sided): 0.01 Kurtosis: 4.98
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
[2] Covariance matrix is singular or near-singular, with condition number 5.91e+35. Standard errors may be unstable.
Test MSE: 4204090808.980
The ARIMA model (order 5,1,0) fitted signifies that the current value is predicted from past 5 values and the series has been differenced once to make it stationary.
Key insights from the summary are:
ar.L1, ar.L2, ar.L4, and ar.L5 are significant predictors, while ar.L3 is not. The model's forecast is evaluated using Mean Squared Error (MSE), and the plot shows the actual vs. predicted outcomes for a visual performance check.
# Fit another candidate: ARIMA(5,2,0) — second-order differencing (d=2)
# to make the twice-differenced series stationary
model = ARIMA(jd_bf_df['y'], order=(5, 2, 0))
model_fit = model.fit()
# Summary of the fitted model
print(model_fit.summary())

# Inspect the residual errors visually
residuals = pd.DataFrame(model_fit.resid)
residuals.plot()
plt.show()

# Walk-forward one-step-ahead evaluation on the last third of the series
X = jd_bf_df['y'].values
size = int(len(X) * 0.66)
train, test = X[:size], X[size:]
history = list(train)
predictions = []
for obs in test:
    model = ARIMA(history, order=(5, 2, 0))
    model_fit = model.fit()
    yhat = model_fit.forecast()[0]
    predictions.append(yhat)
    history.append(obs)

# Evaluation
error = mean_squared_error(test, predictions)
print('Test MSE: %.3f' % error)
# Plot forecasts against actual outcomes
plt.plot(test)
plt.plot(predictions, color='red')
plt.show()
SARIMAX Results
==============================================================================
Dep. Variable: y No. Observations: 57
Model: ARIMA(5, 2, 0) Log Likelihood -712.085
Date: Thu, 13 Jul 2023 AIC 1436.170
Time: 21:47:41 BIC 1448.214
Sample: 0 HQIC 1440.828
- 57
Covariance Type: opg
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
ar.L1 -1.2981 0.080 -16.239 0.000 -1.455 -1.141
ar.L2 -1.4883 0.114 -13.019 0.000 -1.712 -1.264
ar.L3 -1.0878 0.171 -6.360 0.000 -1.423 -0.753
ar.L4 -0.7212 0.154 -4.691 0.000 -1.022 -0.420
ar.L5 -0.4164 0.071 -5.847 0.000 -0.556 -0.277
sigma2 6.203e+09 2.12e-11 2.93e+20 0.000 6.2e+09 6.2e+09
===================================================================================
Ljung-Box (L1) (Q): 0.06 Jarque-Bera (JB): 16.82
Prob(Q): 0.81 Prob(JB): 0.00
Heteroskedasticity (H): 0.29 Skew: -0.82
Prob(H) (two-sided): 0.01 Kurtosis: 5.16
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
[2] Covariance matrix is singular or near-singular, with condition number 1.58e+36. Standard errors may be unstable.
Test MSE: 5247751413.737
We have fit another ARIMA model with order (5,2,0), where d=2 means that the series has been differenced twice to make it stationary.
Key insights from the summary are:
ar.L1 to ar.L5 are all significant predictors. After this, the model's forecast is evaluated using Mean Squared Error (MSE), and the plot shows the actual vs. predicted outcomes for a visual performance check.
# Fit a third candidate: ARIMA(4,2,1) — 4 AR lags, d=2, 1 MA term
model = ARIMA(jd_bf_df['y'], order=(4, 2, 1))
model_fit = model.fit()
# Summary of the fitted model
print(model_fit.summary())

# Inspect the residual errors visually
# (`residuals` is reused by the residual-analysis cell further down)
residuals = pd.DataFrame(model_fit.resid)
residuals.plot()
plt.show()

# Walk-forward one-step-ahead evaluation on the last third of the series
X = jd_bf_df['y'].values
size = int(len(X) * 0.66)
train, test = X[:size], X[size:]
history = list(train)
predictions = []
for obs in test:
    model = ARIMA(history, order=(4, 2, 1))
    model_fit = model.fit()
    yhat = model_fit.forecast()[0]
    predictions.append(yhat)
    history.append(obs)

# Evaluation
error = mean_squared_error(test, predictions)
print('Test MSE: %.3f' % error)
# Plot forecasts against actual outcomes
plt.plot(test)
plt.plot(predictions, color='red')
plt.show()
SARIMAX Results
==============================================================================
Dep. Variable: y No. Observations: 57
Model: ARIMA(4, 2, 1) Log Likelihood -708.900
Date: Thu, 13 Jul 2023 AIC 1429.799
Time: 21:47:43 BIC 1441.843
Sample: 0 HQIC 1434.457
- 57
Covariance Type: opg
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
ar.L1 -0.5061 0.119 -4.258 0.000 -0.739 -0.273
ar.L2 -0.5477 0.172 -3.193 0.001 -0.884 -0.212
ar.L3 0.0834 0.218 0.382 0.702 -0.344 0.511
ar.L4 -0.0997 0.211 -0.472 0.637 -0.514 0.314
ma.L1 -0.9503 0.122 -7.798 0.000 -1.189 -0.711
sigma2 9.395e+09 7.25e-12 1.3e+21 0.000 9.39e+09 9.39e+09
===================================================================================
Ljung-Box (L1) (Q): 0.40 Jarque-Bera (JB): 3.10
Prob(Q): 0.53 Prob(JB): 0.21
Heteroskedasticity (H): 0.31 Skew: -0.01
Prob(H) (two-sided): 0.02 Kurtosis: 4.16
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
[2] Covariance matrix is singular or near-singular, with condition number 1.42e+37. Standard errors may be unstable.
Test MSE: 5085945631.220
We have fit an ARIMA model with order (4,2,1). Here, p=4 and q=1 represent the order of the autoregressive and moving average parts, respectively, and d=2 means that the series has been differenced twice to make it stationary.
Key insights from the summary are:
ar.L1, ar.L2, and ma.L1 are significant predictors (p&lt;0.05), while ar.L3 and ar.L4 are not. The model's performance is then evaluated using the Mean Squared Error (MSE) and visually checked by plotting the actual vs. predicted outcomes.
Looking at the results:
ARIMA(5,1,0): AIC = 1453.358, BIC = 1465.511, Prob(JB) = 0.00 ARIMA(5,2,0): AIC = 1436.170, BIC = 1448.214, Prob(JB) = 0.00 ARIMA(4,2,1): AIC = 1429.799, BIC = 1441.843, Prob(JB) = 0.21
From these values, we can see that the ARIMA(4,2,1) model has the lowest AIC and BIC values and the highest Jarque-Bera p-value, suggesting that it provides the best fit to the data among the three models based on these criteria.
Now, let's move on to the residual analysis:
# Residual analysis of the fitted ARIMA model
residuals.plot()
plt.title('Residuals from ARIMA Model')
plt.show()

# Density plot of the residuals
residuals.plot(kind='kde')
plt.title('Density of Residuals from ARIMA Model')
plt.show()

# Descriptive statistics of the residuals
print(residuals.describe())

# Durbin-Watson test: a statistic near 2 indicates no first-order autocorrelation
from statsmodels.stats.stattools import durbin_watson
dw = durbin_watson(residuals)
print('Durbin-Watson statistic:', dw)

# Plot the ACF and PACF of the residuals
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
plot_acf(residuals, lags=20)
plt.show()
plot_pacf(residuals, lags=20)
plt.show()

# Perform the Ljung-Box test for white noise.
# BUG FIX: acorr_ljungbox returns a DataFrame (statsmodels >= 0.12), so the
# old `lb, p = acorr_ljungbox(...)` unpacked the column NAMES ('lb_stat',
# 'lb_pvalue') instead of the values — exactly what the garbled printout
# showed. Read the statistic and p-value columns explicitly.
from statsmodels.stats.diagnostic import acorr_ljungbox
lb_test = acorr_ljungbox(residuals, lags=[20])
print('Ljung-Box test:', lb_test['lb_stat'].values)
print('p-value:', lb_test['lb_pvalue'].values)
0 count 57.000000 mean -23155.232563 std 101053.500457 min -290517.143703 25% -68504.876659 50% -22156.802027 75% 22315.791596 max 252097.248016 Durbin-Watson statistic: [2.03199496]
Ljung-Box test: lb_stat p-value: lb_pvalue
The monthly data appears to be highly non-stationary. We now aggregate the sales data into quarterly observations instead of monthly, in order to reduce the fluctuations in the time series and make it more stable, thus potentially making it easier to model.
# 1. Aggregate the monthly data into quarterly observations
jd_bf_df['ds'] = pd.to_datetime(jd_bf_df['ds'])
jd_bf_df.set_index('ds', inplace=True)
jd_bf_df_quarterly = jd_bf_df.resample('Q').sum().reset_index()
# 2. Visualize the quarterly series
plt.figure(figsize=(10,6))
plt.plot(jd_bf_df_quarterly['ds'], jd_bf_df_quarterly['y'])
plt.title('Quarterly Sales of JACK DANIEL\'S')
plt.xlabel('Quarter')
plt.ylabel('Sales')
plt.show()
# 3. Check if the data is stationary with the Augmented Dickey-Fuller test
from statsmodels.tsa.stattools import adfuller
result = adfuller(jd_bf_df_quarterly['y'])
print('ADF Statistic: %f' % result[0])
print('p-value: %f' % result[1])
# 4. Small grid search over (p, d, q) in {0, 1}^3, comparing models by AIC
import itertools
p = d = q = range(0, 2)
pdq = list(itertools.product(p, d, q))
for param in pdq:
    try:
        model_arima = ARIMA(jd_bf_df_quarterly['y'], order=param)
        model_arima_fit = model_arima.fit()
        print(param, model_arima_fit.aic)
    # BUG FIX: the original bare `except:` also swallowed KeyboardInterrupt
    # and SystemExit; catch only ordinary errors from a failed ARIMA fit.
    except Exception:
        continue
# 5. Fit the ARIMA model with the chosen parameters
# Using the (4,2,1) set up
model_arima = ARIMA(jd_bf_df_quarterly['y'], order=(4,2,1))
model_arima_fit = model_arima.fit()
ADF Statistic: -0.126532 p-value: 0.946731 (0, 0, 0) 551.2715620550947 (0, 0, 1) 515.3305705025679 (0, 1, 0) 497.3233820960876 (0, 1, 1) 505.46396030819506 (1, 0, 0) 515.3303744700436 (1, 0, 1) 517.0739615916191 (1, 1, 0) 502.30370591081225 (1, 1, 1) 505.15958063529774
/usr/local/lib/python3.10/dist-packages/statsmodels/base/model.py:604: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
warnings.warn("Maximum Likelihood optimization failed to "
# 6. Make forecasts: project the fitted ARIMA(4,2,1) four quarters ahead
forecast_4 = model_arima_fit.forecast(steps=4)
print("Forecasted values for the next 4 quarters: ", forecast_4)
Forecasted values for the next 4 quarters: 19 584505.311716 20 579836.406951 21 728856.061288 22 764344.464654 Name: predicted_mean, dtype: float64
In this sample analysis, we have examined Brown-Forman's (B-F) portfolio and discussed the prevailing market trends. Personally, I believe it is a positive indication that B-F may be able to enhance profitability by reducing subsidies to consumers. This approach could also help mitigate supply chain risks, considering the significant impact of COVID-19 on global supply chain management. However, further efforts are required, including continuous monitoring of the LASSO and ARIMA models as we move forward, given the expectation of a return to normalcy post-COVID. Additionally, incorporating macroeconomic data such as the CPI index, Unemployment Rate, and Commodity Index would provide further insights into the analysis. Thank you for your time, and please don't hesitate to reach out if you have any questions, comments, or advice.